| Model Group | Model Name | Link | Viable | Dependencies | Input Requirements | Additional Preprocessing/Data Formats | Data Type | Complexity | Training Time |
|---|---|---|---|---|---|---|---|---|---|
| Hybrid Models | Prophet | Link | Yes | Python Native (Stan backend) | 2D shape (timestamp, data) | Dataframe with 'ds' (date) and 'y' (value) columns | Univariate | Low | Moderate |
| Hybrid Models | Flow Forecast | Link | Yes | PyTorch | 3D shape (timestamp, data, features) | Requires data normalization, handles missing values | Multivariate | Moderate | High |
| Hybrid Models | DeepTime | Link | Yes | ? | 3D shape (timestamp, data, features) | Requires data normalization, handles missing values | Multivariate | High | High |
| Machine Learning Models | XGBoost | Link | Yes | Python Native | 2D shape (data, features) | Requires numerical input, handles missing values | Both | Moderate | Moderate |
| Machine Learning Models | LightGBM | Link | Yes | Python Native | 2D shape (data, features) | Requires numerical input, handles missing values | Both | Moderate | Moderate |
| Machine Learning Models | AutoTS | Link | Yes | ? | 2D shape (timestamp, data) | Requires a univariate time series | Univariate | Low | Low |
| Machine Learning Models | PyCaret | Link | Yes | Python Native (Multiple backend models) | 2D shape (data, features) | Automatic preprocessing, handles categorical features and missing values | Both | Low | Low |
| Neural Network Models | LSTM | Link | Yes | TensorFlow, Keras, PyTorch (depends on implementation) | 3D shape (timestamp, data, features) | Requires data normalization, sequences of fixed length | Both | High | High |
| Neural Network Models | TCN | Link | Yes | TensorFlow, Keras, PyTorch (depends on implementation) | 3D shape (timestamp, data, features) | Requires data normalization, sequences of fixed length | Both | High | High |
| Neural Network Models | Temporal Fusion Transformer | Link | Yes | TensorFlow | 3D shape (timestamp, data, features) | Requires data normalization, handles missing values | Multivariate | High | High |
| Traditional Statistical Models | GARCH | Link | Yes | Python Native (often via libraries like arch) |
2D shape (timestamp, data) | Requires stationary data, handles missing values | Univariate | Low | Low |
| Traditional Statistical Models | Linear Regression | Link | Yes | Python Native (often via libraries like scikit-learn) |
2D shape (data, features) | Requires numerical input | Both | Low | Low |
| Traditional Statistical Models | Auto-Arima | Link | Yes | Python Native (often via libraries like pmdarima) |
2D shape (timestamp, data) | Requires stationary data | Univariate | Moderate | Moderate |
| Traditional Statistical Models | Markov Chains | Link | Yes | Python Native | 2D shape (states, transition probabilities) | Requires state transition matrix | Both | Low | Low |
#!pip install requests
#!pip install numpy
#!pip install pandas
#!pip install scipy
#!pip install seaborn
#!pip install matplotlib
#!pip install cryptocmd
#!pip install prettytable
#!pip install joblib
#!pip install scikit-learn
#!pip install lightgbm
#!pip install xgboost
#!pip install statsmodels
#!pip install pmdarima
#!pip install arch
#!pip install tensorflow
#!pip install keras-tuner
#!pip install bokeh
#!pip install solenium
# Standard library imports
import os
import time
import sys
import warnings
import logging
import pickle
from typing import Optional
from datetime import datetime, timedelta
from math import pi
# Third-party imports
import requests
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns
import matplotlib.pyplot as plt
from cryptocmd import CmcScraper
from prettytable import PrettyTable
import joblib
from joblib import dump, load
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.stattools import adfuller, grangercausalitytests
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.tsatools import detrend
from pmdarima import auto_arima
from arch import arch_model
from pandas.tseries.holiday import USFederalHolidayCalendar
from scipy.signal import detrend
import xgboost as xgb
#from selenium import webdriver
# Machine Learning Libraries
from sklearn.model_selection import train_test_split, TimeSeriesSplit, cross_val_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, QuantileTransformer, PowerTransformer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score, accuracy_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
# Deep Learning Libraries
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import (Dense, Dropout, LSTM, TimeDistributed, Conv1D, MaxPooling1D, Flatten,
ConvLSTM2D, BatchNormalization, GRU)
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l1, l2, l1_l2
from keras_tuner import HyperModel, RandomSearch
from keras.models import Sequential
from keras.layers import LSTM, Dropout, Dense, BatchNormalization
from keras.regularizers import l1, l2, l1_l2
from keras.callbacks import EarlyStopping, ReduceLROnPlateau
from keras_tuner.tuners import BayesianOptimization
from tensorflow.keras.layers import SimpleRNN
# Plotting and Visualization
from bokeh.plotting import figure, show, output_notebook, save
from bokeh.models import (HoverTool, ColumnDataSource, WheelZoomTool, Span, Range1d,
FreehandDrawTool, MultiLine, NumeralTickFormatter, Button, CustomJS)
from bokeh.layouts import column, row
from bokeh.io import curdoc, export_png
from bokeh.models.widgets import CheckboxGroup
from bokeh.themes import Theme
from bokeh.io.export import get_screenshot_as_png
from IPython.core.display import display, HTML
import hashlib
# Other settings
warnings.filterwarnings('ignore')
pd.set_option('display.float_format', '{:.3f}'.format)
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] - %(message)s')
logger = logging.getLogger(__name__)
display(HTML("<style>.bk-root { margin-left: auto; margin-right: auto; }</style>"))
print("Hello Everyone")
Hello Everyone
# Standard library imports
import os
import sys
import time
import json
import warnings
import logging
import pickle
import hashlib
from abc import abstractmethod
from datetime import datetime, timedelta
from math import pi
from typing import Optional
# Third-party imports
import requests
import numpy as np
import pandas as pd
import scipy.stats as stats
from scipy.signal import detrend
from scipy.stats import jarque_bera, kstest
#from scipy.stats import boxcox, invboxcox # Uncomment if needed
import seaborn as sns
from cryptocmd import CmcScraper
from prettytable import PrettyTable
from pmdarima import auto_arima
from arch import arch_model
# Machine Learning Libraries
from sklearn.model_selection import train_test_split, TimeSeriesSplit, cross_val_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, QuantileTransformer, PowerTransformer
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score, accuracy_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.svm import SVR
from lightgbm import LGBMRegressor
import xgboost as xgb
from statsmodels.tsa.api import seasonal_decompose
from statsmodels.tsa.stattools import adfuller, grangercausalitytests, kpss
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
# Deep Learning Libraries
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Sequential, Model
from tensorflow.keras.layers import (Dense, Dropout, LSTM, TimeDistributed, Conv1D, MaxPooling1D, Flatten,
ConvLSTM2D, BatchNormalization, GRU, Bidirectional, Attention, Input,
Reshape, GlobalAveragePooling1D, GlobalMaxPooling1D)
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l1, l2, l1_l2
from keras_tuner import HyperModel, RandomSearch, BayesianOptimization
# Plotting and Visualization
import matplotlib.pyplot as plt
from bokeh.plotting import figure, show, output_notebook, save
from bokeh.models import (HoverTool, ColumnDataSource, WheelZoomTool, Span, Range1d,
FreehandDrawTool, MultiLine, NumeralTickFormatter, Button, CustomJS)
from bokeh.layouts import column, row
from bokeh.io import curdoc, export_png
from bokeh.models.widgets import CheckboxGroup
from bokeh.themes import Theme
from IPython.core.display import display, HTML
from joblib import dump, load
# Other settings
warnings.filterwarnings('ignore')
pd.set_option('display.float_format', '{:.3f}'.format)
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] - %(message)s')
logger = logging.getLogger(__name__)
display(HTML("<style>.bk-root { margin-left: auto; margin-right: auto; }</style>"))
print("Hello Everyone")
Hello Everyone
class CryptoData:
"""
The CryptoData class is responsible for fetching and validating cryptocurrency data.
It provides methods to fetch raw data, validate its integrity, and format it for display.
Attributes:
- EXPECTED_COLUMNS: A set of expected columns in the fetched data.
- crypto_symbol: The cryptocurrency symbol to fetch.
- retries: The maximum number of data fetch retries.
- backoff_factor: The exponential backoff factor for retries.
"""
EXPECTED_COLUMNS = {'Date', 'Open', 'High', 'Low', 'Close', 'Market Cap', 'Volume'}
def __init__(self, crypto_symbol: str, retries: int = 5, backoff_factor: float = 0.3):
"""Initializes the class with the given cryptocurrency symbol."""
logger.info("Initializing CryptoData class.")
self.crypto_symbol = crypto_symbol
self.retries = retries
self.backoff_factor = backoff_factor
logger.info("CryptoData class initialized.")
def _validate_data(self, df: pd.DataFrame) -> Optional[pd.DataFrame]:
"""Validates the integrity of the fetched data."""
logger.info("Starting data validation.")
if df.isnull().any().any():
raise ValueError("The fetched data contains missing values.")
if not self.EXPECTED_COLUMNS.issubset(df.columns):
raise ValueError("Some expected columns are missing in the data.")
for col in df.columns:
if col != 'Date' and not pd.api.types.is_numeric_dtype(df[col]):
raise ValueError(f"Column {col} is not of numeric type in the fetched data.")
logger.info("Data validation completed.")
return df
def _fetch_cryptocmd_data(self) -> pd.DataFrame:
"""Fetches cryptocurrency data with retries and exponential backoff."""
logger.info(f"Fetching data for {self.crypto_symbol}.")
for retry in range(self.retries):
try:
scraper = CmcScraper(self.crypto_symbol)
df = scraper.get_dataframe()
return self._validate_data(df)
except Exception as e:
logger.error(f'An error occurred while fetching data: {e}')
sleep_time = self.backoff_factor * (2 ** retry)
time.sleep(sleep_time)
logger.info(f'Retrying... (Attempt {retry + 1}/{self.retries})')
else:
raise Exception('Max retries reached. Could not fetch the data.')
logger.info(f"Data fetched successfully for {self.crypto_symbol}.")
def get_cryptocmd_data(self) -> pd.DataFrame:
"""Fetches and returns the cryptocurrency data."""
logger.info(f"Getting {self.crypto_symbol} data.")
df = self._fetch_cryptocmd_data()
df.set_index('Date', inplace=True)
df.index = pd.to_datetime(df.index)
df = df.iloc[::-1] # Reversing the DataFrame
logger.info(f"Data obtained successfully for {self.crypto_symbol}.")
return df.round(2)
@staticmethod
def _format_monetary_value(value: float) -> str:
"""Formats a monetary value to a string."""
return "${:,.2f}".format(value)
@staticmethod
def _format_volume_value(value: float) -> str:
"""Formats a volume value to a string."""
if value > 1e9:
return "{:.2f}B".format(value/1e9)
elif value > 1e6:
return "{:.2f}M".format(value/1e6)
else:
return "{:,.2f}".format(value)
def get_display_data(self) -> pd.DataFrame:
"""Fetches and formats the cryptocurrency data for display."""
logger.info(f"Formatting display data for {self.crypto_symbol}.")
display_df = self.get_cryptocmd_data().copy()
monetary_columns = ['Open', 'High', 'Low', 'Close']
display_df[monetary_columns] = display_df[monetary_columns].applymap(self._format_monetary_value)
volume_like_columns = ['Volume', 'Market Cap']
display_df[volume_like_columns] = display_df[volume_like_columns].applymap(self._format_volume_value)
logger.info(f"Display data formatted successfully for {self.crypto_symbol}.")
return display_df
# Initialize CryptoData class for different cryptocurrencies
crypto_symbols = ['BTC']
crypto_data_objects = {}
for symbol in crypto_symbols:
crypto_data_objects[symbol] = CryptoData(symbol)
# Fetch and save data
data_c = {}
display_data = {}
for symbol, crypto_data_object in crypto_data_objects.items():
data_c[symbol] = crypto_data_object.get_cryptocmd_data()
display_data[symbol] = crypto_data_object.get_display_data()
# Now, data['BTC'], data['ETH'], etc. contain the data
# And display_data['BTC'], display_data['ETH'], etc. contain the display data
2023-10-02 12:55:59,208 [INFO] - Initializing CryptoData class. 2023-10-02 12:55:59,210 [INFO] - CryptoData class initialized. 2023-10-02 12:55:59,213 [INFO] - Getting BTC data. 2023-10-02 12:55:59,214 [INFO] - Fetching data for BTC. 2023-10-02 12:56:01,328 [INFO] - Starting data validation. 2023-10-02 12:56:01,332 [INFO] - Data validation completed. 2023-10-02 12:56:01,343 [INFO] - Data obtained successfully for BTC. 2023-10-02 12:56:01,348 [INFO] - Formatting display data for BTC. 2023-10-02 12:56:01,349 [INFO] - Getting BTC data. 2023-10-02 12:56:01,350 [INFO] - Fetching data for BTC. 2023-10-02 12:56:01,762 [INFO] - Starting data validation. 2023-10-02 12:56:01,762 [INFO] - Data validation completed. 2023-10-02 12:56:01,777 [INFO] - Data obtained successfully for BTC. 2023-10-02 12:56:01,803 [INFO] - Display data formatted successfully for BTC.
data = data_c['BTC'].copy()
data
| Open | High | Low | Close | Volume | Market Cap | |
|---|---|---|---|---|---|---|
| Date | ||||||
| 2013-04-28 | 136.690 | 143.370 | 110.850 | 127.950 | 25956.840 | 1419007893.660 |
| 2013-04-29 | 134.440 | 147.490 | 134.000 | 144.540 | 0.000 | 1603768864.500 |
| 2013-04-30 | 144.000 | 146.930 | 134.050 | 139.000 | 0.000 | 1542813125.000 |
| 2013-05-01 | 139.000 | 139.890 | 107.720 | 116.990 | 0.000 | 1298954593.750 |
| 2013-05-02 | 116.380 | 125.600 | 92.280 | 105.210 | 0.000 | 1168517495.250 |
| ... | ... | ... | ... | ... | ... | ... |
| 2023-09-27 | 26209.500 | 26817.840 | 26111.460 | 26352.720 | 11718380997.240 | 513818515485.060 |
| 2023-09-28 | 26355.810 | 27259.500 | 26327.320 | 27021.550 | 14079002707.330 | 526883682824.270 |
| 2023-09-29 | 27024.840 | 27225.940 | 26721.760 | 26911.720 | 10396435377.430 | 524766460979.210 |
| 2023-09-30 | 26911.690 | 27091.800 | 26888.970 | 26967.920 | 5331172801.030 | 525885640459.760 |
| 2023-10-01 | 26967.400 | 28047.240 | 26965.090 | 27983.750 | 9503917434.430 | 545719315434.050 |
3809 rows × 6 columns
class CryptoDataAnalytics:
"""
This class is responsible for performing enhanced analytics on cryptocurrency data.
Attributes:
df (pd.DataFrame): The DataFrame containing the cryptocurrency data.
output_dir (str): The directory where analytics files will be saved.
"""
def __init__(self, crypto_data: pd.DataFrame):
logger.info("Initializing CryptoDataAnalytics class.")
self.df = crypto_data
self.output_dir = 'analytics_csv'
self._create_output_dir()
logger.info("CryptoDataAnalytics class initialized successfully.")
def _create_output_dir(self):
"""Create output directory if it doesn't exist."""
if not os.path.exists(self.output_dir):
os.makedirs(self.output_dir)
logger.info(f"Created output directory: {self.output_dir}")
def calculate_historical_volatility(self, column: str = 'Close', window: int = 30) -> pd.DataFrame:
"""Calculates historical volatility."""
logger.info("Initiating historical volatility calculation.")
if len(self.df) < window:
logger.error("Data length is less than the rolling window size. Cannot calculate volatility.")
raise ValueError("Insufficient data for volatility calculation.")
log_ret = np.log(self.df[column] / self.df[column].shift(1))
volatility = log_ret.rolling(window=window).std()
logger.info("Historical volatility calculation successful.")
return pd.DataFrame(volatility, columns=['Historical Volatility'])
def perform_time_analysis(self, freq: str):
"""Performs time-based analysis."""
logger.info(f"Initiating {freq}-based time analysis.")
data = self.df.resample(freq).agg({'Close': ['last', 'mean', 'max', 'min'], 'Open': 'first'})
data.columns = data.columns.map('_'.join).str.strip('_')
data = self.calculate_price_variation(data)
# Reorder columns
ordered_columns = ['Close_mean', 'Close_max', 'Close_min', 'Close_last', 'Open_first', 'variation_$_abs', 'variation_%_rel']
data = data[ordered_columns]
logger.info(f"{freq}-based time analysis successful.")
return data
def calculate_price_variation(self, data: pd.DataFrame):
"""Calculates price variation."""
logger.info("Initiating price variation calculation.")
data['variation_$_abs'] = data['Close_last'] - data['Open_first']
data['variation_%_rel'] = ((data['Close_last'] - data['Open_first']) / data['Open_first']) * 100
logger.info("Price variation calculation successful.")
return data
def retrieve_all_time_records(self):
"""Retrieves all-time price records."""
logger.info("Initiating retrieval of all-time records.")
all_time_high = self.df['Close'].max()
all_time_low = self.df['Close'].min()
all_time_high_date = self.df['Close'].idxmax().strftime('%Y-%m-%d')
all_time_low_date = self.df['Close'].idxmin().strftime('%Y-%m-%d')
logger.info("All-time records retrieval successful.")
return all_time_high, all_time_low, all_time_high_date, all_time_low_date
def perform_and_save_all_analyses(self):
"""Performs all analyses and saves them to Excel files."""
logger.info("Initiating all analyses.")
self.save_analysis_to_excel(self.perform_time_analysis('Y'), 'yearly_data.xlsx')
self.save_analysis_to_excel(self.perform_time_analysis('M'), 'monthly_data.xlsx')
self.save_analysis_to_excel(self.perform_time_analysis('W'), 'weekly_data.xlsx')
logger.info("All analyses have been successfully performed and saved.")
def save_analysis_to_excel(self, analysis: pd.DataFrame, filename: str):
"""Saves the given DataFrame to an Excel file in the output directory."""
filepath = os.path.join(self.output_dir, filename)
analysis.to_excel(filepath)
logger.info(f"Analysis saved to {filepath}.")
analytics = CryptoDataAnalytics(data)
# Retrieve and display all-time records
all_time_high, all_time_low, all_time_high_date, all_time_low_date = analytics.retrieve_all_time_records()
print(f"All Time High: {all_time_high} on {all_time_high_date}")
print(f"All Time Low: {all_time_low} on {all_time_low_date}")
# Run all analyses and save them
analytics.perform_and_save_all_analyses()
# Additionally, display the DataFrames
yearly_data = analytics.perform_time_analysis('Y')
monthly_data = analytics.perform_time_analysis('M')
weekly_data = analytics.perform_time_analysis('W')
display(yearly_data)
display(monthly_data)
display(weekly_data)
2023-10-02 12:56:01,887 [INFO] - Initializing CryptoDataAnalytics class. 2023-10-02 12:56:01,887 [INFO] - CryptoDataAnalytics class initialized successfully. 2023-10-02 12:56:01,887 [INFO] - Initiating retrieval of all-time records. 2023-10-02 12:56:01,887 [INFO] - All-time records retrieval successful. 2023-10-02 12:56:01,887 [INFO] - Initiating all analyses. 2023-10-02 12:56:01,887 [INFO] - Initiating Y-based time analysis. 2023-10-02 12:56:01,925 [INFO] - Initiating price variation calculation. 2023-10-02 12:56:01,928 [INFO] - Price variation calculation successful. 2023-10-02 12:56:01,928 [INFO] - Y-based time analysis successful.
All Time High: 67566.83 on 2021-11-08 All Time Low: 68.43 on 2013-07-05
2023-10-02 12:56:02,637 [INFO] - Analysis saved to analytics_csv\yearly_data.xlsx. 2023-10-02 12:56:02,637 [INFO] - Initiating M-based time analysis. 2023-10-02 12:56:02,652 [INFO] - Initiating price variation calculation. 2023-10-02 12:56:02,665 [INFO] - Price variation calculation successful. 2023-10-02 12:56:02,670 [INFO] - M-based time analysis successful. 2023-10-02 12:56:02,731 [INFO] - Analysis saved to analytics_csv\monthly_data.xlsx. 2023-10-02 12:56:02,733 [INFO] - Initiating W-based time analysis. 2023-10-02 12:56:02,748 [INFO] - Initiating price variation calculation. 2023-10-02 12:56:02,754 [INFO] - Price variation calculation successful. 2023-10-02 12:56:02,756 [INFO] - W-based time analysis successful. 2023-10-02 12:56:02,949 [INFO] - Analysis saved to analytics_csv\weekly_data.xlsx. 2023-10-02 12:56:02,949 [INFO] - All analyses have been successfully performed and saved. 2023-10-02 12:56:02,949 [INFO] - Initiating Y-based time analysis. 2023-10-02 12:56:02,968 [INFO] - Initiating price variation calculation. 2023-10-02 12:56:02,973 [INFO] - Price variation calculation successful. 2023-10-02 12:56:02,976 [INFO] - Y-based time analysis successful. 2023-10-02 12:56:02,977 [INFO] - Initiating M-based time analysis. 2023-10-02 12:56:02,977 [INFO] - Initiating price variation calculation. 2023-10-02 12:56:02,977 [INFO] - Price variation calculation successful. 2023-10-02 12:56:02,992 [INFO] - M-based time analysis successful. 2023-10-02 12:56:02,992 [INFO] - Initiating W-based time analysis. 2023-10-02 12:56:03,012 [INFO] - Initiating price variation calculation. 2023-10-02 12:56:03,012 [INFO] - Price variation calculation successful. 2023-10-02 12:56:03,012 [INFO] - W-based time analysis successful.
| Close_mean | Close_max | Close_min | Close_last | Open_first | variation_$_abs | variation_%_rel | |
|---|---|---|---|---|---|---|---|
| Date | |||||||
| 2013-12-31 | 257.449 | 1151.170 | 68.430 | 754.010 | 136.690 | 617.320 | 451.620 |
| 2014-12-31 | 527.237 | 953.290 | 310.740 | 320.190 | 754.970 | -434.780 | -57.589 |
| 2015-12-31 | 272.453 | 465.320 | 178.100 | 430.570 | 320.430 | 110.140 | 34.373 |
| 2016-12-31 | 568.492 | 975.920 | 364.330 | 963.740 | 430.720 | 533.020 | 123.751 |
| 2017-12-31 | 4006.034 | 19497.400 | 777.760 | 14156.400 | 963.660 | 13192.740 | 1369.024 |
| 2018-12-31 | 7572.299 | 17527.000 | 3236.760 | 3742.700 | 14112.200 | -10369.500 | -73.479 |
| 2019-12-31 | 7395.247 | 13016.230 | 3399.470 | 7193.600 | 3746.710 | 3446.890 | 91.998 |
| 2020-12-31 | 11116.378 | 29001.720 | 4970.790 | 29001.720 | 7194.890 | 21806.830 | 303.088 |
| 2021-12-31 | 47436.932 | 67566.830 | 29374.150 | 46306.450 | 28994.010 | 17312.440 | 59.710 |
| 2022-12-31 | 28197.877 | 47686.810 | 15787.280 | 16547.500 | 46311.740 | -29764.240 | -64.269 |
| 2023-12-31 | 26359.197 | 31476.050 | 16625.080 | 27983.750 | 16547.910 | 11435.840 | 69.107 |
| Close_mean | Close_max | Close_min | Close_last | Open_first | variation_$_abs | variation_%_rel | |
|---|---|---|---|---|---|---|---|
| Date | |||||||
| 2013-04-30 | 137.163 | 144.540 | 127.950 | 139.000 | 136.690 | 2.310 | 1.690 |
| 2013-05-31 | 119.993 | 133.480 | 97.750 | 129.000 | 139.000 | -10.000 | -7.194 |
| 2013-06-30 | 107.761 | 129.300 | 94.650 | 96.610 | 128.820 | -32.210 | -25.004 |
| 2013-07-31 | 90.512 | 107.990 | 68.430 | 106.090 | 97.510 | 8.580 | 8.799 |
| 2013-08-31 | 113.905 | 135.350 | 102.800 | 135.350 | 106.210 | 29.140 | 27.436 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 2023-06-30 | 27763.199 | 30695.470 | 25124.680 | 30477.250 | 27218.410 | 3258.840 | 11.973 |
| 2023-07-31 | 30057.470 | 31476.050 | 29176.920 | 29230.110 | 30471.850 | -1241.740 | -4.075 |
| 2023-08-31 | 27852.792 | 29765.490 | 25931.470 | 25931.470 | 29230.870 | -3299.400 | -11.287 |
| 2023-09-30 | 26306.137 | 27211.120 | 25162.660 | 26967.920 | 25934.020 | 1033.900 | 3.987 |
| 2023-10-31 | 27983.750 | 27983.750 | 27983.750 | 27983.750 | 26967.400 | 1016.350 | 3.769 |
127 rows × 7 columns
| Close_mean | Close_max | Close_min | Close_last | Open_first | variation_$_abs | variation_%_rel | |
|---|---|---|---|---|---|---|---|
| Date | |||||||
| 2013-04-28 | 127.950 | 127.950 | 127.950 | 127.950 | 136.690 | -8.740 | -6.394 |
| 2013-05-05 | 118.843 | 144.540 | 97.750 | 115.910 | 134.440 | -18.530 | -13.783 |
| 2013-05-12 | 113.926 | 117.200 | 111.500 | 115.000 | 115.980 | -0.980 | -0.845 |
| 2013-05-19 | 118.709 | 123.500 | 111.500 | 121.990 | 114.820 | 7.170 | 6.245 |
| 2013-05-26 | 127.733 | 133.480 | 122.000 | 133.480 | 122.500 | 10.980 | 8.963 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 2023-09-03 | 26385.910 | 27727.390 | 25800.720 | 25969.570 | 26089.610 | -120.040 | -0.460 |
| 2023-09-10 | 25888.484 | 26240.190 | 25753.240 | 25832.230 | 25968.170 | -135.940 | -0.523 |
| 2023-09-17 | 26210.736 | 26608.690 | 25162.660 | 26534.190 | 25831.720 | 702.470 | 2.719 |
| 2023-09-24 | 26725.833 | 27211.120 | 26256.830 | 26256.830 | 26532.990 | -276.160 | -1.041 |
| 2023-10-01 | 26821.913 | 27983.750 | 26217.250 | 27983.750 | 26253.780 | 1729.970 | 6.589 |
545 rows × 7 columns
class CryptoAnalyticsVisual:
"""
The CryptoAnalyticsVisual class provides tools for cryptocurrency market analysis and visualization.
Attributes:
data (pd.DataFrame): Raw crypto data with 'Open', 'Close', 'High', 'Low', 'Volume'.
Methods:
_create_visualizations_directory: Creates directory for visualizations.
save_plot_to_file: Saves Bokeh plot to file.
calculate_macd, plot_macd_bokeh: Handles MACD calculation and plotting.
calculate_rsi, plot_rsi_bokeh: Handles RSI calculation and plotting.
calculate_bollinger_bands, plot_bollinger_bands_bokeh: Handles Bollinger Bands.
calculate_fibonacci_retracement, plot_fibonacci_retracement_bokeh: Handles Fibonacci retracement.
volume_analysis, plot_volume_analysis_bokeh: Handles volume analysis.
create_candlestick_chart, plot_trend_bokeh: Plots candlestick and trend data.
Example:
>>> df = pd.read_csv('crypto_data.csv')
>>> analytics = CryptoAnalyticsVisual(df)
>>> analytics.plot_macd_bokeh()
"""
def __init__(self, data: pd.DataFrame):
self.data = data
output_notebook()
curdoc().theme = 'dark_minimal'
self._create_visualizations_directory()
logger.info('CryptoAnalyticsVisual instance created and initialized.')
def _create_visualizations_directory(self):
"""Creates a directory for storing visualization assets."""
if not os.path.exists('visualizations_assets'):
os.makedirs('visualizations_assets')
logger.info("Created directory: visualizations_assets")
def save_plot_to_file(self, plot, filename: str, format: str = 'html'):
"""Saves plot to a file."""
full_path = os.path.join('visualizations_assets', filename)
if format == 'html':
save(plot, filename=full_path)
logger.info(f'Plot saved to file: {full_path}')
else:
logger.error('Unsupported file format: {}'.format(format))
def calculate_macd(self, short_window=12, long_window=26, signal_window=9):
"""Calculates MACD and signal line."""
short_ema = self.data['Close'].ewm(span=short_window, adjust=False).mean()
long_ema = self.data['Close'].ewm(span=long_window, adjust=False).mean()
macd_line = short_ema - long_ema
signal_line = macd_line.ewm(span=signal_window, adjust=False).mean()
logger.info(f'MACD calculated with short_window={short_window}, long_window={long_window}, and signal_window={signal_window}')
return macd_line, signal_line
def plot_macd_bokeh(self, display=True):
"""Plots MACD and signal line."""
macd_line, signal_line = self.calculate_macd()
source = ColumnDataSource(data=dict(x=self.data.index, y1=macd_line, y2=signal_line))
p = figure(width=1400, height=600, title="MACD Analysis", x_axis_type="datetime")
p.line(x='x', y='y1', source=source, legend_label="MACD Line", color="blue", alpha=0.8)
p.line(x='x', y='y2', source=source, legend_label="Signal Line", color="red", alpha=0.8)
hover = HoverTool(tooltips=[("Date", "@x{%F}"), ("MACD", "@y1"), ("Signal", "@y2")], formatters={"@x": "datetime"})
p.add_tools(hover)
logger.info('MACD plot displayed.')
if display:
show(p)
return p
def calculate_rsi(self, window=14):
"""Calculates the Relative Strength Index (RSI)."""
delta = self.data['Close'].diff()
gain = (delta.where(delta > 0, 0)).fillna(0)
loss = (-delta.where(delta < 0, 0)).fillna(0)
avg_gain = gain.rolling(window=window, min_periods=1).mean()
avg_loss = loss.rolling(window=window, min_periods=1).mean()
rs = avg_gain / avg_loss
rsi = 100 - (100 / (1 + rs))
logger.info(f'RSI calculated with window={window}')
return rsi
def plot_rsi_bokeh(self, display=True):
"""Plots the Relative Strength Index (RSI)."""
rsi = self.calculate_rsi()
source = ColumnDataSource(data=dict(x=self.data.index, y=rsi))
p = figure(width=1400, height=600, title="RSI Analysis", x_axis_type="datetime")
p.line(x='x', y='y', source=source, legend_label="RSI", color="green", alpha=0.8)
hover = HoverTool(tooltips=[("Date", "@x{%F}"), ("RSI", "@y")], formatters={"@x": "datetime"})
p.add_tools(hover)
p.add_layout(Span(location=70, dimension='width', line_color='red', line_width=1, line_dash='dashed'))
p.add_layout(Span(location=30, dimension='width', line_color='red', line_width=1, line_dash='dashed'))
logger.info('RSI plot displayed.')
if display:
show(p)
return p
def calculate_bollinger_bands(self, window=20, num_std=2):
"""Calculates upper and lower Bollinger Bands."""
rolling_mean = self.data['Close'].rolling(window=window).mean()
rolling_std = self.data['Close'].rolling(window=window).std()
upper_band = rolling_mean + (rolling_std * num_std)
lower_band = rolling_mean - (rolling_std * num_std)
logger.info(f'Bollinger Bands calculated with window={window} and num_std={num_std}')
return upper_band, lower_band
def plot_bollinger_bands_bokeh(self, display=True):
"""Plots Bollinger Bands and Close Price."""
upper_band, lower_band = self.calculate_bollinger_bands()
source = ColumnDataSource(data=dict(x=self.data.index, close=self.data['Close'], upper=upper_band, lower=lower_band))
p = figure(width=1400, height=600, title="Bollinger Bands Analysis", x_axis_type="datetime")
p.line(x='x', y='close', source=source, legend_label="Close Price", color="blue", alpha=0.8)
p.line(x='x', y='upper', source=source, legend_label="Upper Band", color="red", alpha=0.5)
p.line(x='x', y='lower', source=source, legend_label="Lower Band", color="green", alpha=0.5)
hover = HoverTool(tooltips=[("Date", "@x{%F}"), ("Close", "@close{$0,0.00} K"), ("Upper Band", "@upper{$0,0.00} K"), ("Lower Band", "@lower{$0,0.00} K")], formatters={"@x": "datetime"})
p.yaxis.formatter = NumeralTickFormatter(format="$0,0.00")
p.add_tools(hover)
logger.info('Bollinger Bands plot displayed.')
if display:
show(p)
return p
def calculate_fibonacci_retracement(self):
"""Calculates Fibonacci retracement levels."""
max_price = self.data['High'].max()
min_price = self.data['Low'].min()
diff = max_price - min_price
levels = [0.0, 0.236, 0.382, 0.5, 0.618, 0.786, 1.0]
retracement_levels = {level: (max_price - level * diff) for level in levels}
logger.info('Fibonacci retracement levels calculated.')
return retracement_levels
def plot_fibonacci_retracement_bokeh(self, display=True):
"""Plots Fibonacci retracement levels."""
retracement_levels = self.calculate_fibonacci_retracement()
source = ColumnDataSource(data=dict(x=self.data.index, close=self.data['Close']))
p = figure(width=1400, height=600, title="Fibonacci Retracement Levels", x_axis_type="datetime")
p.line(x='x', y='close', source=source, legend_label="Close Price", color="blue", alpha=0.8)
for level, price in retracement_levels.items():
p.add_layout(Span(location=price, dimension='width', line_color='red', line_width=1, line_dash='dashed'))
p.line([], [], line_color="red", legend_label=f'Level: {level}', line_dash='dashed')
hover = HoverTool(tooltips=[("Date", "@x{%F}"), ("Close Price", "@close{$0,0.00} K")], formatters={"@x": "datetime"})
p.yaxis.formatter = NumeralTickFormatter(format="$0,0.00")
p.add_tools(hover)
logger.info('Fibonacci retracement plot displayed.')
if display:
show(p)
return p
def volume_analysis(self):
"""Analyzes volume data and computes average volume over 30 days."""
volume = self.data['Volume'] / 1_000 # Convert to Thousands
avg_volume = volume.rolling(window=30).mean()
logger.info('Volume analysis completed.')
return volume, avg_volume
def plot_volume_analysis_bokeh(self, display=True):
"""Plots volume and 30-day average volume."""
volume, avg_volume = self.volume_analysis()
source = ColumnDataSource(data=dict(x=self.data.index, volume=volume, avg_volume=avg_volume))
p = figure(width=1400, height=600, title="Volume Analysis (in Thousands)", x_axis_type="datetime")
p.vbar(x='x', top='volume', source=source, width=0.9, legend_label="Volume", alpha=0.6, color="blue")
p.line(x='x', y='avg_volume', source=source, legend_label="30-Day Avg Volume", color="red", line_width=2)
hover = HoverTool(tooltips=[("Date", "@x{%F}"), ("Volume", "@volume{$0,0} K"), ("30-Day Avg Volume", "@avg_volume{$0,0} K")], formatters={"@x": "datetime"})
p.yaxis.formatter = NumeralTickFormatter(format="$0,0")
p.add_tools(hover)
logger.info('Volume analysis plot displayed.')
if display:
show(p)
return p
def create_candlestick_chart(self, time_period='last_month', ma_period=20, display=True):
"""
Creates a candlestick chart for the selected time period and moving average period.
"""
logger.info("Creating candlestick chart.")
# Assuming _select_data is a method that filters the data based on the time_period
df = self._select_data(time_period)
df['index_col'] = df.index
df['MA'] = df['Close'].rolling(window=ma_period).mean()
inc = df.Close > df.Open
dec = df.Open > df.Close
source_inc = ColumnDataSource(df[inc])
source_dec = ColumnDataSource(df[dec])
source_hover = ColumnDataSource(df)
w = 12 * 60 * 60 * 1000
TOOLS = "pan,wheel_zoom,box_zoom,reset,save"
p = figure(x_axis_type="datetime", tools=TOOLS, width=1400, title="Crypto Candlestick with MA")
p.xaxis.major_label_orientation = pi / 4
p.grid.grid_line_alpha = 0.3
p.segment('index_col', 'High', 'index_col', 'Low', color="black", source=source_inc)
p.vbar('index_col', w, 'Open', 'Close', fill_color="#39B86B", line_color="black", source=source_inc)
p.segment('index_col', 'High', 'index_col', 'Low', color="black", source=source_dec)
p.vbar('index_col', w, 'Open', 'Close', fill_color="#F2583E", line_color="black", source=source_dec)
hover = HoverTool(
tooltips=[
("Date", "@index_col{%F}"),
("Open", "@{Open}{($ 0,0.00)}"),
("Close", "@{Close}{($ 0,0.00)}"),
("High", "@{High}{($ 0,0.00)}"),
("Low", "@{Low}{($ 0,0.00)}"),
("MA", "@{MA}{($ 0,0.00)}")
],
formatters={
'@index_col': 'datetime',
'@Open': 'numeral',
'@Close': 'numeral',
'@High': 'numeral',
'@Low': 'numeral',
'@MA': 'numeral'
},
mode='vline'
)
p.add_tools(hover)
p.line('index_col', 'MA', color='blue', legend_label='Moving Average', source=source_hover)
if display:
show(p)
logger.info('Candlestick chart displayed.')
return p
def plot_trend_bokeh(self, display=True):
"""
Plots trend data using various moving averages for analysis.
"""
logger.info("Creating trend analysis plot.")
# Assuming _identify_trend is a method that identifies the trend in the data
trend_data = self._identify_trend()
source = ColumnDataSource(data={**{'x': self.data.index, 'price': trend_data['Price']}, **{f"mavg{period}": trend_data[f"{period}_day_mavg"] for period in [3, 7, 15, 40, 90, 120]}})
p = figure(width=1400, height=600, title="Trend Analysis using Moving Averages", x_axis_type="datetime")
p.line(x='x', y='price', source=source, legend_label="Close Price", alpha=0.8)
colors = {"3": "orange", "7": "yellow", "15": "cyan", "40": "red", "90": "purple", "120": "green"}
for period, color in colors.items():
p.line(x='x', y=f'mavg{period}', source=source, legend_label=f"{period}-day MA", color=color, line_dash="dashed")
hover = HoverTool(
tooltips=[
("Date", "@x{%F}"),
("Price", "@price{$0,0.00} K"),
("3-day MA", "@mavg3{$0,0.00} K"),
("7-day MA", "@mavg7{$0,0.00} K"),
("15-day MA", "@mavg15{$0,0.00} K"),
("40-day MA", "@mavg40{$0,0.00} K"),
("90-day MA", "@mavg90{$0,0.00} K"),
("120-day MA", "@mavg120{$0,0.00} K")
],
formatters={"@x": "datetime"}
)
p.yaxis.formatter = NumeralTickFormatter(format="$0,0.00")
p.add_tools(hover)
if display:
show(p)
logger.info('Trend plot displayed.')
return p
def _identify_trend(self, column: str = 'Close'):
signals = pd.DataFrame(index=self.data.index)
signals['Price'] = self.data[column]
# Moving Averages
ma_periods = [3, 7, 15, 40, 90, 120]
for period in ma_periods:
signals[f'{period}_day_mavg'] = self.data[column].rolling(window=period, min_periods=1, center=False).mean()
# Signal based on 40-day and 120-day moving averages (since there's no 100-day moving average in the new setup)
signals['signal'] = 0.0
signals['signal'][40:] = np.where(signals['40_day_mavg'][40:] > signals['120_day_mavg'][40:], 1.0, 0.0)
return signals
def _select_data(self, time_period):
logger.info("Selecting data for time period: %s", time_period)
if time_period == 'last_month':
last_month = self.data.index.max() - pd.DateOffset(months=1)
df = self.data[self.data.index >= last_month]
elif time_period == 'last_3_months':
last_3_months = self.data.index.max() - pd.DateOffset(months=3)
df = self.data[self.data.index >= last_3_months]
elif time_period == 'last_6_months':
last_6_months = self.data.index.max() - pd.DateOffset(months=6)
df = self.data[self.data.index >= last_6_months]
elif time_period == 'last_1_year':
last_1_year = self.data.index.max() - pd.DateOffset(years=1)
df = self.data[self.data.index >= last_1_year]
elif time_period == 'last_3_years':
last_3_years = self.data.index.max() - pd.DateOffset(years=3)
df = self.data[self.data.index >= last_3_years]
else:
df = self.data
return df
crypto_analytics = CryptoAnalyticsVisual(data)
candle = crypto_analytics.create_candlestick_chart(time_period='last_6_months', ma_period=20)
trend = crypto_analytics.plot_trend_bokeh()
bollinger_bands = crypto_analytics.plot_bollinger_bands_bokeh()
macd = crypto_analytics.plot_macd_bokeh()
rsi = crypto_analytics.plot_rsi_bokeh()
fibonacci_retracement = crypto_analytics.plot_fibonacci_retracement_bokeh()
volume = crypto_analytics.plot_volume_analysis_bokeh()
2023-10-02 12:56:03,178 [INFO] - CryptoAnalyticsVisual instance created and initialized. 2023-10-02 12:56:03,179 [INFO] - Creating candlestick chart. 2023-10-02 12:56:03,179 [INFO] - Selecting data for time period: last_6_months
2023-10-02 12:56:03,545 [INFO] - Candlestick chart displayed. 2023-10-02 12:56:03,548 [INFO] - Creating trend analysis plot.
2023-10-02 12:56:04,069 [INFO] - Trend plot displayed. 2023-10-02 12:56:04,095 [INFO] - Bollinger Bands calculated with window=20 and num_std=2 2023-10-02 12:56:04,168 [INFO] - Bollinger Bands plot displayed.
2023-10-02 12:56:04,528 [INFO] - MACD calculated with short_window=12, long_window=26, and signal_window=9 2023-10-02 12:56:04,595 [INFO] - MACD plot displayed.
2023-10-02 12:56:04,819 [INFO] - RSI calculated with window=14 2023-10-02 12:56:04,886 [INFO] - RSI plot displayed.
2023-10-02 12:56:05,276 [INFO] - Fibonacci retracement levels calculated. 2023-10-02 12:56:05,373 [INFO] - Fibonacci retracement plot displayed.
2023-10-02 12:56:05,643 [INFO] - Volume analysis completed. 2023-10-02 12:56:05,694 [INFO] - Volume analysis plot displayed.
crypto_analytics.save_plot_to_file(candle, 'candle.html')
crypto_analytics.save_plot_to_file(trend, 'trend.html')
crypto_analytics.save_plot_to_file(bollinger_bands, 'bollinger_bands.html')
crypto_analytics.save_plot_to_file(macd, 'macd.html')
crypto_analytics.save_plot_to_file(rsi, 'rsi.html')
crypto_analytics.save_plot_to_file(fibonacci_retracement, 'fibonacci_retracement.html')
crypto_analytics.save_plot_to_file(volume, 'rsi.html')
2023-10-02 12:56:06,373 [INFO] - Plot saved to file: visualizations_assets\candle.html 2023-10-02 12:56:06,533 [INFO] - Plot saved to file: visualizations_assets\trend.html 2023-10-02 12:56:06,652 [INFO] - Plot saved to file: visualizations_assets\bollinger_bands.html 2023-10-02 12:56:06,761 [INFO] - Plot saved to file: visualizations_assets\macd.html 2023-10-02 12:56:06,877 [INFO] - Plot saved to file: visualizations_assets\rsi.html 2023-10-02 12:56:07,089 [INFO] - Plot saved to file: visualizations_assets\fibonacci_retracement.html 2023-10-02 12:56:07,215 [INFO] - Plot saved to file: visualizations_assets\rsi.html
class Feature_Eng_Tech:
"""
The Feature_Eng_Tech class is responsible for applying various feature engineering techniques on time series data.
Attributes:
df (pd.DataFrame): Original time series data.
target_column (str): Target column for which features are being generated.
data_eng (pd.DataFrame): DataFrame with engineered features.
logger (logging.Logger): Logger for tracking operations and debugging.
Methods:
reset_data: Resets the engineered data to its original state.
handle_missing_values: Handles missing values in the DataFrame.
add_date_features: Adds date-related features like year, month, day, and optionally day of the week.
add_lag_features: Adds lag features based on a given window size.
add_rolling_features: Adds rolling window features like mean and standard deviation.
add_expanding_window_features: Adds expanding window features like mean, min, max, and sum.
add_seasonal_decomposition: Adds seasonal decomposition features like trend, seasonality, and residuals.
detrend_data: Detrends the time series data.
add_holiday_features: Adds a feature to indicate holidays.
add_fourier_features: Adds Fourier features based on a given period and order.
handle_nan_values_post_engineering: Handles NaN values post feature engineering.
feature_engineering: Applies multiple feature engineering methods based on a configuration dictionary.
get_engineered_data: Returns the DataFrame with engineered features.
"""
def __init__(self, df: pd.DataFrame, target_column: str):
if not isinstance(df, pd.DataFrame):
raise ValueError("Input data should be a pandas DataFrame.")
if target_column not in df.columns:
raise ValueError(f"Target column {target_column} not found in DataFrame.")
self.df = df.copy()
self.target_column = target_column
self.data_eng = self.df.copy()
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] - %(message)s')
self.logger = logging.getLogger(__name__)
self.logger.info("Initialized Feature_Eng_Tech.")
def reset_data(self):
"""Resets the engineered data to its original state."""
self.data_eng = self.df.copy()
self.logger.info("Reset data to the original state.")
def handle_missing_values(self, method: str = 'ffill'):
"""
Handles missing values in the DataFrame.
Parameters:
method (str): Method to handle missing values ('ffill', 'bfill', 'interpolate', 'drop'). Default is 'ffill'.
"""
if method not in ['ffill', 'bfill', 'interpolate', 'drop']:
raise ValueError("Invalid method for handling missing values. Choose 'ffill', 'bfill', 'interpolate', or 'drop'.")
if self.df.isnull().sum().sum() > 0:
self.df.fillna(method=method, inplace=True)
self.logger.info(f"Handled missing values using {method} method.")
else:
self.logger.info("No missing values detected.")
def add_date_features(self, include_day_of_week: bool = True):
"""
Adds date-related features like year, month, day, and optionally day of the week.
Parameters:
include_day_of_week (bool): Whether to include the day of the week as a feature. Default is True.
"""
if not isinstance(self.df.index, pd.DatetimeIndex):
self.df.index = pd.to_datetime(self.df.index)
self.df['year'] = self.df.index.year
self.df['month'] = self.df.index.month
self.df['day'] = self.df.index.day
if include_day_of_week:
self.df['day_of_week'] = self.df.index.dayofweek
self.logger.info("Date-related features added.")
def add_lag_features(self, window: int = 3):
"""
Adds lag features based on a given window size.
Parameters:
window (int): The window size for creating lag features. Default is 3.
"""
if window > len(self.data_eng):
raise ValueError("The window parameter should be less than the length of the time series data.")
for i in range(1, window + 1):
self.data_eng[f"lag_{i}"] = self.data_eng[self.target_column].shift(i)
self.logger.info(f'Added lag features with window size {window}.')
def add_rolling_features(self, window: int = 3, min_periods: int = 1):
"""
Adds rolling window features like mean and standard deviation.
Parameters:
window (int): The window size for rolling features. Default is 3.
min_periods (int): Minimum number of observations required to have a value. Default is 1.
"""
self.data_eng[f"rolling_mean_{window}"] = self.data_eng[self.target_column].rolling(window=window, min_periods=min_periods).mean()
self.data_eng[f"rolling_std_{window}"] = self.data_eng[self.target_column].rolling(window=window, min_periods=min_periods).std()
self.logger.info(f'Added rolling window features with window size {window}.')
def add_expanding_window_features(self, min_periods: int = 1):
"""
Adds expanding window features like mean, min, max, and sum.
Parameters:
min_periods (int): Minimum number of observations required to have a value. Default is 1.
"""
self.data_eng['expanding_mean'] = self.data_eng[self.target_column].expanding(min_periods=min_periods).mean()
self.data_eng['expanding_min'] = self.data_eng[self.target_column].expanding(min_periods=min_periods).min()
self.data_eng['expanding_max'] = self.data_eng[self.target_column].expanding(min_periods=min_periods).max()
self.data_eng['expanding_sum'] = self.data_eng[self.target_column].expanding(min_periods=min_periods).sum()
self.logger.info('Added expanding window features.')
def add_seasonal_decomposition(self, period: int = 12, model: str = 'additive'):
"""
Adds seasonal decomposition features like trend, seasonality, and residuals.
Parameters:
period (int): The period for seasonal decomposition. Default is 12.
model (str): The model type for seasonal decomposition ('additive' or 'multiplicative'). Default is 'additive'.
"""
result = seasonal_decompose(self.data_eng[self.target_column], period=period, model=model)
self.data_eng['trend'] = result.trend
self.data_eng['seasonal'] = result.seasonal
self.data_eng['residual'] = result.resid
self.logger.info(f'Added seasonal decomposition with period {period} and model {model}.')
def detrend_data(self):
"""Detrends the time series data."""
self.data_eng['detrended'] = detrend(self.data_eng[self.target_column])
self.logger.info('Detrended the data.')
def add_holiday_features(self):
"""Adds a feature to indicate holidays."""
cal = USFederalHolidayCalendar()
holidays = cal.holidays(start=self.data_eng.index.min(), end=self.data_eng.index.max())
self.data_eng['is_holiday'] = self.data_eng.index.isin(holidays).astype(int)
self.logger.info('Added holiday features.')
def add_fourier_features(self, period: int, order: int):
"""
Adds Fourier features based on a given period and order.
Parameters:
period (int): The period for Fourier features.
order (int): The order for Fourier features.
"""
for i in range(1, order + 1):
self.data_eng[f'fourier_sin_{i}'] = np.sin(2 * i * np.pi * self.data_eng.index.dayofyear / period)
self.data_eng[f'fourier_cos_{i}'] = np.cos(2 * i * np.pi * self.data_eng.index.dayofyear / period)
self.logger.info(f'Added Fourier features with period {period} and order {order}.')
def handle_nan_values_post_engineering(self, method: str = 'drop'):
"""
Handles NaN values post feature engineering.
Parameters:
method (str): Method to handle missing values ('drop', 'ffill', 'bfill'). Default is 'drop'.
"""
if method == 'drop':
self.data_eng.dropna(inplace=True)
elif method == 'ffill':
self.data_eng.fillna(method='ffill', inplace=True)
elif method == 'bfill':
self.data_eng.fillna(method='bfill', inplace=True)
else:
raise ValueError("Invalid method. Choose 'drop', 'ffill', or 'bfill'.")
self.logger.info(f"Handled NaN values using {method} method.")
def feature_engineering(self, config: dict):
"""
Applies multiple feature engineering methods based on a configuration dictionary.
Parameters:
config (dict): A dictionary with the configuration for feature engineering.
"""
feature_methods = {
"handle_missing_values": self.handle_missing_values,
"add_date_features": self.add_date_features,
"add_lag_features": self.add_lag_features,
"add_rolling_features": self.add_rolling_features,
"add_expanding_window_features": self.add_expanding_window_features,
"add_seasonal_decomposition": self.add_seasonal_decomposition,
"detrend_data": self.detrend_data,
"add_holiday_features": self.add_holiday_features,
"add_fourier_features": lambda: self.add_fourier_features(config.get("fourier_period", 365), config.get("fourier_order", 3))
}
for feature, method in feature_methods.items():
if config.get(feature):
method()
self.handle_nan_values_post_engineering()
self.logger.info('Feature engineering steps applied based on configuration.')
def get_engineered_data(self) -> pd.DataFrame:
"""
Returns the DataFrame with engineered features.
Returns:
pd.DataFrame: DataFrame containing the engineered features.
"""
return self.data_eng.copy()
feature_eng = Feature_Eng_Tech(data, target_column='Close')
# Define a configuration for feature engineering
config = {
"handle_missing_values": True,
"add_date_features": True,
"add_lag_features": True,
"add_rolling_features": True,
"add_expanding_window_features": True,
"add_seasonal_decomposition": True,
"detrend_data": True,
"add_holiday_features": True,
"add_fourier_features": True,
}
# Apply feature engineering based on the configuration
feature_eng.feature_engineering(config)
# Get the engineered data
data_eng = feature_eng.get_engineered_data()
data_eng
2023-10-02 12:56:07,300 [INFO] - Initialized Feature_Eng_Tech. 2023-10-02 12:56:07,303 [INFO] - No missing values detected. 2023-10-02 12:56:07,303 [INFO] - Date-related features added. 2023-10-02 12:56:07,318 [INFO] - Added lag features with window size 3. 2023-10-02 12:56:07,318 [INFO] - Added rolling window features with window size 3. 2023-10-02 12:56:07,340 [INFO] - Added expanding window features. 2023-10-02 12:56:07,344 [INFO] - Added seasonal decomposition with period 12 and model additive. 2023-10-02 12:56:07,344 [INFO] - Detrended the data. 2023-10-02 12:56:07,380 [INFO] - Added holiday features. 2023-10-02 12:56:07,386 [INFO] - Added Fourier features with period 365 and order 3. 2023-10-02 12:56:07,386 [INFO] - Handled NaN values using drop method. 2023-10-02 12:56:07,386 [INFO] - Feature engineering steps applied based on configuration.
| Open | High | Low | Close | Volume | Market Cap | lag_1 | lag_2 | lag_3 | rolling_mean_3 | ... | seasonal | residual | detrended | is_holiday | fourier_sin_1 | fourier_cos_1 | fourier_sin_2 | fourier_cos_2 | fourier_sin_3 | fourier_cos_3 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Date | |||||||||||||||||||||
| 2013-05-04 | 98.100 | 115.000 | 92.500 | 112.500 | 0.000 | 1250316562.500 | 97.750 | 105.210 | 116.990 | 105.153 | ... | 31.201 | -35.744 | 8164.979 | 0 | 0.845 | -0.534 | -0.903 | -0.429 | 0.120 | 0.993 |
| 2013-05-05 | 112.900 | 118.800 | 107.140 | 115.910 | 0.000 | 1288693175.500 | 112.500 | 97.750 | 105.210 | 108.720 | ... | -41.853 | 42.389 | 8157.730 | 0 | 0.836 | -0.549 | -0.918 | -0.398 | 0.171 | 0.985 |
| 2013-05-06 | 115.980 | 124.660 | 106.640 | 112.300 | 0.000 | 1249023060.000 | 115.910 | 112.500 | 97.750 | 113.570 | ... | 15.317 | -16.170 | 8143.461 | 0 | 0.826 | -0.563 | -0.931 | -0.366 | 0.222 | 0.975 |
| 2013-05-07 | 112.250 | 113.440 | 97.700 | 111.500 | 0.000 | 1240593600.000 | 112.300 | 115.910 | 112.500 | 113.237 | ... | -20.387 | 19.692 | 8132.002 | 0 | 0.817 | -0.577 | -0.943 | -0.333 | 0.272 | 0.962 |
| 2013-05-08 | 109.600 | 115.780 | 109.600 | 113.570 | 0.000 | 1264049202.150 | 111.500 | 112.300 | 115.910 | 112.457 | ... | -28.438 | 29.510 | 8123.413 | 0 | 0.806 | -0.591 | -0.954 | -0.301 | 0.321 | 0.947 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2023-09-21 | 27129.840 | 27152.940 | 26389.300 | 26567.630 | 13371443707.650 | 517864270862.510 | 27132.010 | 27211.120 | 26754.280 | 26970.253 | ... | 31.201 | -61.882 | -5798.673 | 0 | -0.986 | -0.167 | 0.329 | -0.944 | 0.876 | 0.483 |
| 2023-09-22 | 26564.060 | 26726.080 | 26495.530 | 26579.570 | 10578746708.520 | 518118003343.510 | 26567.630 | 27132.010 | 27211.120 | 26759.737 | ... | -41.853 | 14.891 | -5797.392 | 0 | -0.989 | -0.150 | 0.297 | -0.955 | 0.900 | 0.437 |
| 2023-09-23 | 26578.560 | 26634.190 | 26520.520 | 26579.390 | 7404700300.940 | 518140112048.320 | 26579.570 | 26567.630 | 27132.010 | 26575.530 | ... | 15.317 | -77.076 | -5808.231 | 0 | -0.991 | -0.133 | 0.264 | -0.965 | 0.921 | 0.390 |
| 2023-09-24 | 26579.370 | 26716.060 | 26221.050 | 26256.830 | 8192867685.970 | 511876976253.540 | 26579.390 | 26579.570 | 26567.630 | 26471.930 | ... | -20.387 | -388.564 | -6141.450 | 0 | -0.993 | -0.116 | 0.230 | -0.973 | 0.940 | 0.342 |
| 2023-09-25 | 26253.780 | 26421.510 | 26011.470 | 26298.480 | 11997833256.530 | 512712185601.520 | 26256.830 | 26579.390 | 26579.570 | 26378.233 | ... | -28.438 | -379.958 | -6110.459 | 0 | -0.995 | -0.099 | 0.197 | -0.980 | 0.956 | 0.293 |
3797 rows × 26 columns
logger = logging.getLogger(__name__)
class TimeSeriesAnalysis:
"""
A class to perform various time series analysis tasks such as stationarity checks, volatility modeling, and decomposition.
Attributes:
data (pd.DataFrame): Time series data.
target (str): Target column for time series analysis.
"""
def __init__(self, data, target):
"""
Initialize the TimeSeriesAnalysis class.
Parameters:
data (pd.DataFrame): Time series data.
target (str): Target column for time series analysis.
"""
logger.info("Initializing TimeSeriesAnalysis class")
if target not in data.columns:
raise ValueError(f"'{target}' is not a column in the provided data.")
self.data = data
self.target = target
self.alpha = 0.05
def save_and_show_plot(self, fig, filename, show=True):
"""
Utility method to save and display the plot.
Parameters:
fig (matplotlib.figure.Figure): The plot figure.
filename (str): Filename to save the plot.
show (bool, optional): Whether to display the plot. Default is True.
"""
if not os.path.exists('ts_plots_assets'):
os.makedirs('ts_plots_assets')
path = os.path.join('ts_plots_assets', filename)
fig.savefig(path)
logger.info(f"Plot saved to: {path}")
if show:
plt.show()
def check_stationarity(self):
"""
Check the stationarity of the time series data using the Augmented Dickey-Fuller test.
Returns:
tuple: ADF Statistic, p-value, and critical values.
"""
logger.info("Checking stationarity of the time series")
result = adfuller(self.data[self.target])
print('-'*60)
print('ADF Statistic:', result[0])
print('p-value:', result[1])
print('Critical Values:')
for key, value in result[4].items():
print('\t{}: {}'.format(key, value))
if result[1] <= 0.05:
print('The series is likely stationary.')
else:
print('The series is likely non-stationary.')
print('-'*60)
print("\n")
return result[0], result[1], result[4]
def check_autocorrelation(self):
"""
Check the autocorrelation of the time series using ACF and PACF plots.
Returns:
tuple: ACF and PACF figures.
"""
logger.info("Checking autocorrelation of the time series")
fig, (ax1, ax2) = plt.subplots(1,2, figsize=(20,6))
plot_acf(self.data[self.target], lags=50, alpha=0.05, ax=ax1)
ax1.set_title("ACF for {}".format(self.target))
plot_pacf(self.data[self.target], lags=50, alpha=0.05, method='ols', ax=ax2)
ax2.set_title("PACF for {}".format(self.target))
self.save_and_show_plot(fig, 'autocorrelation.png')
return fig
def check_volatility(self, p=1, q=1):
"""
Check volatility using GARCH model.
Parameters:
p (int, optional): The number of lag observations to include in the GARCH model. Default is 1.
q (int, optional): The number of lag forecast errors to include in the GARCH model. Default is 1.
Returns:
str: Summary of the GARCH model fit.
"""
logger.info("Checking volatility of the time series")
try:
model = arch_model(self.data[self.target], vol='Garch', p=p, q=q)
model_fit = model.fit(disp='off')
summary_str = model_fit.summary().as_text()
print(summary_str)
return summary_str
except Exception as e:
print(f"Error encountered: {e}")
return f"Error encountered: {e}"
def decompose_time_series(self, model='additive', period=30, show=True):
"""
Decompose the time series data into trend, seasonal, and residual components.
Parameters:
model (str): The type of decomposition model ('additive' or 'multiplicative').
period (int): The period for seasonal decomposition.
show (bool): Whether to display the plot.
Returns:
fig (matplotlib.Figure): The figure object containing the decomposition plots.
"""
logger.info("Decomposing the time series")
result = seasonal_decompose(self.data[self.target], model=model, period=period)
# Adjusting the figsize here
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(20, 12))
result.observed.plot(ax=ax1)
ax1.set_title('Observed')
result.trend.plot(ax=ax2)
ax2.set_title('Trend')
result.seasonal.plot(ax=ax3)
ax3.set_title('Seasonal')
result.resid.plot(ax=ax4)
ax4.set_title('Residual')
self.save_and_show_plot(fig, 'decompose.png', show=show)
return fig
def test_granger_causality(self, other_column, maxlag=30, verbose=False):
"""Test Granger Causality between target and another time series column.
Parameters:
other_column (str): The name of the other column to test for Granger Causality.
maxlag (int): The maximum number of lags to consider for the test.
verbose (bool): Whether to display detailed output.
Returns:
dict: A dictionary containing the Granger Causality test results.
"""
logger.info("Testing Granger causality")
if other_column not in self.data.columns:
raise ValueError(f"'{other_column}' is not a column in the provided data.")
other_data = self.data[other_column].values
target_data = self.data[self.target].values
data = np.column_stack((target_data, other_data))
result = grangercausalitytests(data, maxlag=maxlag, verbose=verbose)
return result
def concise_granger_output_table(self, granger_results):
"""Generate a concise report from the Granger Causality test results in a table format."""
table_content = ['<table border="1" style="border-collapse:collapse;">']
lags = list(granger_results.keys())
for i in range(0, len(lags), 6):
table_content.append('<tr>')
for j in range(6):
if i + j < len(lags):
lag = lags[i + j]
test_statistics = granger_results[lag][0]
cell_content = (f"<b>Lag: {lag}</b><br>"
f"ssr_ftest: F={test_statistics['ssr_ftest'][0]:.4f}, p={test_statistics['ssr_ftest'][1]:.4f}<br>"
f"ssr_chi2test: chi2={test_statistics['ssr_chi2test'][0]:.4f}, p={test_statistics['ssr_chi2test'][1]:.4f}<br>"
f"lrtest: chi2={test_statistics['lrtest'][0]:.4f}, p={test_statistics['lrtest'][1]:.4f}<br>"
f"params_ftest: F={test_statistics['params_ftest'][0]:.4f}, p={test_statistics['params_ftest'][1]:.4f}")
table_content.append(f'<td style="padding: 8px; text-align: left;">{cell_content}</td>')
table_content.append('</tr>')
table_content.append('</table>')
return "\n".join(table_content)
def check_jarque_bera(self, alpha=None):
"""
Perform Jarque-Bera test to check for normality.
Parameters:
alpha (float, optional): Significance level. Defaults to class-level alpha.
Returns:
dict: Test results including test statistic, p-value, and conclusion.
"""
alpha = alpha if alpha is not None else self.alpha
logger.info("Performing Jarque-Bera test")
jb_value, p_value = jarque_bera(self.data[self.target])
conclusion = "Fail to reject the null hypothesis: The series likely follows a normal distribution." if p_value > alpha else "Reject the null hypothesis: The series likely does not follow a normal distribution."
result = {
'test_statistic': jb_value,
'p_value': p_value,
'alpha': alpha,
'null_hypothesis': 'The series follows a normal distribution.',
'conclusion': conclusion
}
output_str = (f"Jarque-Bera test statistic: {jb_value}\n"
f"p-value: {p_value}\n"
f"Significance level: {alpha}\n"
f"{conclusion}")
print(output_str)
print("\n")
logger.info(output_str.replace("\n", " "))
return result
def check_kpss(self, alpha=None):
"""
Perform KPSS test to check for stationarity.
Parameters:
alpha (float, optional): Significance level. Defaults to class-level alpha.
Returns:
dict: Test results including test statistic, p-value, critical values, and conclusion.
"""
alpha = alpha if alpha is not None else self.alpha
logger.info("Performing KPSS test")
kpss_value, p_value, _, crit = kpss(self.data[self.target])
conclusion = "Fail to reject the null hypothesis: The series is likely non-stationary." if p_value > alpha else "Reject the null hypothesis: The series is likely stationary."
result = {
'test_statistic': kpss_value,
'p_value': p_value,
'alpha': alpha,
'critical_values': crit,
'null_hypothesis': 'The series is stationary around a constant.',
'conclusion': conclusion
}
output_str = (f"KPSS test statistic: {kpss_value}\n"
f"p-value: {p_value}\n"
f"Critical values: {crit}\n"
f"Significance level: {alpha}\n"
f"{conclusion}")
print(output_str)
print("\n")
logger.info(output_str.replace("\n", " "))
return result
def check_ks_test(self, alpha=None, dist='norm'):
"""
Perform Kolmogorov-Smirnov test for goodness of fit.
Parameters:
alpha (float, optional): Significance level. Defaults to class-level alpha.
dist (str, optional): The distribution to test against. Defaults to 'norm'.
Returns:
dict: Test results including test statistic, p-value, and conclusion.
"""
alpha = alpha if alpha is not None else self.alpha
logger.info("Performing Kolmogorov-Smirnov test")
ks_value, p_value = kstest(self.data[self.target], dist)
conclusion = "Fail to reject the null hypothesis: The series likely follows the specified distribution." if p_value > alpha else "Reject the null hypothesis: The series likely does not follow the specified distribution."
result = {
'test_statistic': ks_value,
'p_value': p_value,
'alpha': alpha,
'null_hypothesis': f'The series follows the {dist} distribution.',
'conclusion': conclusion
}
output_str = (f"Kolmogorov-Smirnov test statistic: {ks_value}\n"
f"p-value: {p_value}\n"
f"Significance level: {alpha}\n"
f"{conclusion}")
print(output_str)
print("\n")
logger.info(output_str.replace("\n", " "))
return result
from IPython.core.display import display, HTML
# Instantiate and call methods
tsa = TimeSeriesAnalysis(data, target='Close')
# Store and display the results
autocorr_fig = tsa.check_autocorrelation()
decomposition = tsa.decompose_time_series(show=True) # Set show=False if you don't want to display the plot
adf_stat, p_value, crit_values = tsa.check_stationarity()
jb_results = tsa.check_jarque_bera(alpha=0.05)
kpss_results = tsa.check_kpss(alpha=0.05)
ks_results = tsa.check_ks_test(alpha=0.05)
volatility_summary = tsa.check_volatility()
granger_results = tsa.test_granger_causality('Open', maxlag=30, verbose=False)
display(HTML(tsa.concise_granger_output_table(granger_results)))
2023-10-02 12:56:07,469 [INFO] - Initializing TimeSeriesAnalysis class 2023-10-02 12:56:07,469 [INFO] - Checking autocorrelation of the time series 2023-10-02 12:56:08,402 [INFO] - Plot saved to: ts_plots_assets\autocorrelation.png
2023-10-02 12:56:08,568 [INFO] - Decomposing the time series 2023-10-02 12:56:09,461 [INFO] - Plot saved to: ts_plots_assets\decompose.png
2023-10-02 12:56:10,094 [INFO] - Checking stationarity of the time series
2023-10-02 12:56:10,623 [INFO] - Performing Jarque-Bera test
2023-10-02 12:56:10,626 [INFO] - Jarque-Bera test statistic: 1711.9759230184734 p-value: 0.0 Significance level: 0.05 Reject the null hypothesis: The series likely does not follow a normal distribution.
2023-10-02 12:56:10,627 [INFO] - Performing KPSS test
2023-10-02 12:56:10,630 [INFO] - KPSS test statistic: 6.100085065671445 p-value: 0.01 Critical values: {'10%': 0.347, '5%': 0.463, '2.5%': 0.574, '1%': 0.739} Significance level: 0.05 Reject the null hypothesis: The series is likely stationary.
2023-10-02 12:56:10,631 [INFO] - Performing Kolmogorov-Smirnov test
2023-10-02 12:56:10,634 [INFO] - Kolmogorov-Smirnov test statistic: 1.0 p-value: 0.0 Significance level: 0.05 Reject the null hypothesis: The series likely does not follow the specified distribution.
2023-10-02 12:56:10,635 [INFO] - Checking volatility of the time series
2023-10-02 12:56:10,693 [INFO] - Testing Granger causality
------------------------------------------------------------
ADF Statistic: -1.5742601333691042
p-value: 0.496509272280352
Critical Values:
1%: -3.432081608223884
5%: -2.862305129189198
10%: -2.567177288520333
The series is likely non-stationary.
------------------------------------------------------------
Jarque-Bera test statistic: 1711.9759230184734
p-value: 0.0
Significance level: 0.05
Reject the null hypothesis: The series likely does not follow a normal distribution.
KPSS test statistic: 6.100085065671445
p-value: 0.01
Critical values: {'10%': 0.347, '5%': 0.463, '2.5%': 0.574, '1%': 0.739}
Significance level: 0.05
Reject the null hypothesis: The series is likely stationary.
Kolmogorov-Smirnov test statistic: 1.0
p-value: 0.0
Significance level: 0.05
Reject the null hypothesis: The series likely does not follow the specified distribution.
Constant Mean - GARCH Model Results
==============================================================================
Dep. Variable: Close R-squared: 0.000
Mean Model: Constant Mean Adj. R-squared: 0.000
Vol Model: GARCH Log-Likelihood: -38132.5
Distribution: Normal AIC: 76273.0
Method: Maximum Likelihood BIC: 76298.0
No. Observations: 3809
Date: Mon, Oct 02 2023 Df Residuals: 3808
Time: 12:56:10 Df Model: 1
Mean Model
============================================================================
coef std err t P>|t| 95.0% Conf. Int.
----------------------------------------------------------------------------
mu 997.2924 33.198 30.040 2.914e-198 [9.322e+02,1.062e+03]
Volatility Model
=============================================================================
coef std err t P>|t| 95.0% Conf. Int.
-----------------------------------------------------------------------------
omega 4.8437e+06 1.251e+05 38.726 0.000 [4.599e+06,5.089e+06]
alpha[1] 0.8949 3.935e-02 22.743 1.702e-114 [ 0.818, 0.972]
beta[1] 1.1027e-14 4.086e-02 2.699e-13 1.000 [-8.009e-02,8.009e-02]
=============================================================================
Covariance estimator: robust
| Lag: 1 ssr_ftest: F=2.1596, p=0.1418 ssr_chi2test: chi2=2.1613, p=0.1415 lrtest: chi2=2.1607, p=0.1416 params_ftest: F=2.1596, p=0.1418 |
Lag: 2 ssr_ftest: F=0.1844, p=0.8316 ssr_chi2test: chi2=0.3694, p=0.8314 lrtest: chi2=0.3694, p=0.8314 params_ftest: F=0.1844, p=0.8316 |
Lag: 3 ssr_ftest: F=0.8888, p=0.4461 ssr_chi2test: chi2=2.6713, p=0.4451 lrtest: chi2=2.6704, p=0.4453 params_ftest: F=0.8888, p=0.4461 |
Lag: 4 ssr_ftest: F=1.6688, p=0.1543 ssr_chi2test: chi2=6.6912, p=0.1531 lrtest: chi2=6.6853, p=0.1535 params_ftest: F=1.6688, p=0.1543 |
Lag: 5 ssr_ftest: F=0.8054, p=0.5456 ssr_chi2test: chi2=4.0384, p=0.5439 lrtest: chi2=4.0363, p=0.5442 params_ftest: F=0.8054, p=0.5456 |
Lag: 6 ssr_ftest: F=1.0842, p=0.3692 ssr_chi2test: chi2=6.5276, p=0.3667 lrtest: chi2=6.5220, p=0.3673 params_ftest: F=1.0842, p=0.3692 |
| Lag: 7 ssr_ftest: F=2.2702, p=0.0263 ssr_chi2test: chi2=15.9541, p=0.0255 lrtest: chi2=15.9207, p=0.0259 params_ftest: F=2.2702, p=0.0263 |
Lag: 8 ssr_ftest: F=1.6964, p=0.0940 ssr_chi2test: chi2=13.6324, p=0.0919 lrtest: chi2=13.6080, p=0.0926 params_ftest: F=1.6964, p=0.0940 |
Lag: 9 ssr_ftest: F=4.9523, p=0.0000 ssr_chi2test: chi2=44.7946, p=0.0000 lrtest: chi2=44.5327, p=0.0000 params_ftest: F=4.9523, p=0.0000 |
Lag: 10 ssr_ftest: F=2.2712, p=0.0120 ssr_chi2test: chi2=22.8378, p=0.0114 lrtest: chi2=22.7694, p=0.0116 params_ftest: F=2.2712, p=0.0120 |
Lag: 11 ssr_ftest: F=2.2786, p=0.0091 ssr_chi2test: chi2=25.2177, p=0.0085 lrtest: chi2=25.1344, p=0.0087 params_ftest: F=2.2786, p=0.0091 |
Lag: 12 ssr_ftest: F=3.7982, p=0.0000 ssr_chi2test: chi2=45.8811, p=0.0000 lrtest: chi2=45.6061, p=0.0000 params_ftest: F=3.7982, p=0.0000 |
| Lag: 13 ssr_ftest: F=2.8854, p=0.0004 ssr_chi2test: chi2=37.7791, p=0.0003 lrtest: chi2=37.5923, p=0.0003 params_ftest: F=2.8854, p=0.0004 |
Lag: 14 ssr_ftest: F=4.6439, p=0.0000 ssr_chi2test: chi2=65.5157, p=0.0000 lrtest: chi2=64.9566, p=0.0000 params_ftest: F=4.6439, p=0.0000 |
Lag: 15 ssr_ftest: F=4.6480, p=0.0000 ssr_chi2test: chi2=70.2938, p=0.0000 lrtest: chi2=69.6506, p=0.0000 params_ftest: F=4.6480, p=0.0000 |
Lag: 16 ssr_ftest: F=4.8005, p=0.0000 ssr_chi2test: chi2=77.4827, p=0.0000 lrtest: chi2=76.7019, p=0.0000 params_ftest: F=4.8005, p=0.0000 |
Lag: 17 ssr_ftest: F=4.4687, p=0.0000 ssr_chi2test: chi2=76.6753, p=0.0000 lrtest: chi2=75.9104, p=0.0000 params_ftest: F=4.4687, p=0.0000 |
Lag: 18 ssr_ftest: F=4.1744, p=0.0000 ssr_chi2test: chi2=75.8804, p=0.0000 lrtest: chi2=75.1310, p=0.0000 params_ftest: F=4.1744, p=0.0000 |
| Lag: 19 ssr_ftest: F=4.6556, p=0.0000 ssr_chi2test: chi2=89.3761, p=0.0000 lrtest: chi2=88.3385, p=0.0000 params_ftest: F=4.6556, p=0.0000 |
Lag: 20 ssr_ftest: F=4.4216, p=0.0000 ssr_chi2test: chi2=89.3984, p=0.0000 lrtest: chi2=88.3600, p=0.0000 params_ftest: F=4.4216, p=0.0000 |
Lag: 21 ssr_ftest: F=3.7434, p=0.0000 ssr_chi2test: chi2=79.5143, p=0.0000 lrtest: chi2=78.6912, p=0.0000 params_ftest: F=3.7434, p=0.0000 |
Lag: 22 ssr_ftest: F=3.8166, p=0.0000 ssr_chi2test: chi2=84.9760, p=0.0000 lrtest: chi2=84.0366, p=0.0000 params_ftest: F=3.8166, p=0.0000 |
Lag: 23 ssr_ftest: F=4.1247, p=0.0000 ssr_chi2test: chi2=96.0603, p=0.0000 lrtest: chi2=94.8618, p=0.0000 params_ftest: F=4.1247, p=0.0000 |
Lag: 24 ssr_ftest: F=4.3124, p=0.0000 ssr_chi2test: chi2=104.8548, p=0.0000 lrtest: chi2=103.4287, p=0.0000 params_ftest: F=4.3124, p=0.0000 |
| Lag: 25 ssr_ftest: F=3.3900, p=0.0000 ssr_chi2test: chi2=85.9083, p=0.0000 lrtest: chi2=84.9476, p=0.0000 params_ftest: F=3.3900, p=0.0000 |
Lag: 26 ssr_ftest: F=3.4149, p=0.0000 ssr_chi2test: chi2=90.0493, p=0.0000 lrtest: chi2=88.9942, p=0.0000 params_ftest: F=3.4149, p=0.0000 |
Lag: 27 ssr_ftest: F=3.7134, p=0.0000 ssr_chi2test: chi2=101.7414, p=0.0000 lrtest: chi2=100.3970, p=0.0000 params_ftest: F=3.7134, p=0.0000 |
Lag: 28 ssr_ftest: F=3.8029, p=0.0000 ssr_chi2test: chi2=108.1122, p=0.0000 lrtest: chi2=106.5954, p=0.0000 params_ftest: F=3.8029, p=0.0000 |
Lag: 29 ssr_ftest: F=4.1146, p=0.0000 ssr_chi2test: chi2=121.2153, p=0.0000 lrtest: chi2=119.3123, p=0.0000 params_ftest: F=4.1146, p=0.0000 |
Lag: 30 ssr_ftest: F=3.8556, p=0.0000 ssr_chi2test: chi2=117.5672, p=0.0000 lrtest: chi2=115.7755, p=0.0000 params_ftest: F=3.8556, p=0.0000 |
def recheck_statistical_properties(self, tsa):
"""
Re-run statistical tests to confirm transformation effectiveness.
Parameters:
tsa (TimeSeriesAnalysis): TimeSeriesAnalysis object for running statistical tests.
"""
self.logger.info("Rechecking statistical properties after transformations...")
jb_results = tsa.check_jarque_bera(alpha=0.05)
kpss_results = tsa.check_kpss(alpha=0.05)
ks_results = tsa.check_ks_test(alpha=0.05)
return jb_results, kpss_results, ks_results
class UnifiedDataPreprocessor:
"""
UnifiedDataPreprocessor is responsible for preprocessing time series data.
It performs actions like data splitting, normalization, reshaping, and sequence generation.
Attributes:
data (pd.DataFrame): Original time series data.
target_column (str): Target column for preprocessing.
logger (logging.Logger): Logger for tracking operations and debugging.
transformations (list): List of applied transformations.
"""
def __init__(self, df, target_column, logger=None):
self.data = df.copy()
self.target_column = target_column
self.scalers = {}
self.X_train, self.X_test, self.y_train, self.y_test = None, None, None, None
self.X_train_seq, self.X_test_seq, self.y_train_seq, self.y_test_seq = None, None, None, None
self.logger = logger if logger else logging.getLogger(__name__)
self.transformations = []
self.lambda_val = None
self.scalers = {
"MinMax": MinMaxScaler(),
"Standard": StandardScaler(),
"Robust": RobustScaler(),
"Quantile": QuantileTransformer(output_distribution='normal'),
"Power": PowerTransformer(method='yeo-johnson')
}
self.logger.info("Initializing DataPreprocessor...")
def get_scaler(self, scaler_type):
self.logger.info(f"Getting scaler of type: {scaler_type}")
try:
return self.scalers[scaler_type]
except KeyError:
raise ValueError(f"Invalid scaler_type. Supported types are: {', '.join(self.scalers.keys())}")
def split_and_plot_data(self, test_size=0.2, split_date=None, plot=True):
"""
Split the data into training and test sets, and then plot the target variable for both sets.
:param test_size: float, optional, default 0.2. Represents the proportion of the dataset to include in the test split.
:param split_date: str, optional, default None. A date string (e.g., '2023-01-01') to split data at a specific date. If provided, this will override test_size.
"""
self.logger.info("Splitting data...")
features = self.data.drop(columns=[self.target_column])
target = self.data[self.target_column]
if split_date:
train_mask = self.data.index < split_date
self.X_train, self.X_test = features[train_mask], features[~train_mask]
self.y_train, self.y_test = target[train_mask], target[~train_mask]
else:
self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
features, target, test_size=test_size, shuffle=False
)
self.logger.info(f"Data split completed. X_train shape: {self.X_train.shape}, y_train shape: {self.y_train.shape}")
print(f"X_train shape: {self.X_train.shape}, y_train shape: {self.y_train.shape}")
print(f"X_test shape: {self.X_test.shape}, y_test shape: {self.y_test.shape}")
if plot: # Plotting
plt.figure(figsize=(20, 7))
plt.subplot(1, 2, 1)
plt.title('Training Data - Target')
plt.plot(self.y_train, label=self.target_column)
plt.xlabel("Time")
plt.ylabel("Value")
plt.legend()
plt.subplot(1, 2, 2)
plt.title('Test Data - Target')
plt.plot(self.y_test, label=self.target_column)
plt.xlabel("Time")
plt.ylabel("Value")
plt.legend()
plt.show()
def normalize_data(self, scaler_type='MinMax',plot=True):
self.logger.info("Normalizing feature data...")
scaler = self.get_scaler(scaler_type)
self.X_train = scaler.fit_transform(self.X_train)
self.X_test = scaler.transform(self.X_test)
self.scalers['features'] = scaler
self.logger.info("Feature data normalization completed.")
# Plot normalized training and test features
if plot:
plt.figure(figsize=(20, 8))
plt.subplot(1, 2, 1)
plt.title('Normalized Training Features')
for i in range(self.X_train.shape[1]):
plt.plot(self.X_train[:, i], label=f'Feature {i}')
plt.legend()
plt.subplot(1, 2, 2)
plt.title('Normalized Test Features')
for i in range(self.X_test.shape[1]):
plt.plot(self.X_test[:, i], label=f'Feature {i}')
plt.legend()
plt.show()
def normalize_target(self, scaler_type='MinMax', plot=True):
self.logger.info("Normalizing target data...")
scaler = self.get_scaler(scaler_type)
self.y_train = scaler.fit_transform(self.y_train.values.reshape(-1, 1))
self.y_test = scaler.transform(self.y_test.values.reshape(-1, 1))
self.scalers['target'] = scaler
self.logger.info("Target data normalization completed.")
if plot:# Plot normalized training and test targets
plt.figure(figsize=(20, 7))
plt.subplot(1, 2, 1)
plt.title('Normalized Training Target')
plt.plot(self.y_train, label='Normalized ' + self.target_column)
plt.legend()
plt.subplot(1, 2, 2)
plt.title('Normalized Test Target')
plt.plot(self.y_test, label='Normalized ' + self.target_column)
plt.legend()
plt.show()
def difference_and_plot_data(self, interval=1,plot=True):
"""
Apply differencing to the data and then plot it.
Parameters:
interval (int): The interval between differencing. Default is 1.
"""
self.logger.info(f"Applying differencing with interval {interval}...")
self.data = self.data.diff(periods=interval).dropna()
self.transformations.append('Differencing')
self.logger.info("Differencing applied.")
if plot:
plt.figure(figsize=(20, 7))
plt.title('Data after Differencing')
plt.plot(self.data[self.target_column], label=self.target_column)
plt.legend()
plt.show()
def box_cox_transform_and_plot(self, lambda_val=None,plot=True):
if self.y_train is None or self.y_test is None:
self.logger.warning("Data not split yet. Run split_data first.")
return self # Allow method chaining
if np.any(self.y_train <= 0) or np.any(self.y_test <= 0):
self.logger.warning("Data must be positive for Box-Cox transformation.")
return self # Allow method chaining
self.logger.info("Applying Box-Cox transformation...")
self.y_train = self.y_train.ravel()
self.y_test = self.y_test.ravel()
self.y_train, fitted_lambda = boxcox(self.y_train)
self.lambda_val = fitted_lambda if lambda_val is None else lambda_val
self.y_test = boxcox(self.y_test, lmbda=self.lambda_val)
self.logger.info(f"Box-Cox transformation applied with lambda {self.lambda_val}.")
if plot:# Plotting only the target variable
plt.figure(figsize=(20, 7))
plt.subplot(1, 2, 1)
plt.title('Box-Cox Transformed Training Target')
plt.plot(self.y_train, label='Transformed ' + self.target_column)
plt.legend()
plt.subplot(1, 2, 2)
plt.title('Box-Cox Transformed Test Target')
plt.plot(self.y_test, label='Transformed ' + self.target_column)
plt.legend()
plt.show()
def inverse_box_cox_and_plot(self,plot=True):
if f'Box-Cox on {self.target_column}' not in self.transformations:
self.logger.warning("No Box-Cox transformation found on the target column. Skipping inverse transformation.")
return
self.logger.info("Applying inverse Box-Cox transformation...")
self.y_train = invboxcox(self.y_train, self.lambda_val)
self.y_test = invboxcox(self.y_test, self.lambda_val)
self.transformations.remove(f'Box-Cox on {self.target_column}')
self.logger.info(f"Inverse Box-Cox transformation applied on column {self.target_column}.")
if plot:
plt.figure(figsize=(20, 7))
plt.subplot(1, 2, 1)
plt.title('Inverse Box-Cox Transformed Training Target')
plt.plot(self.y_train, label='Inverse Transformed ' + self.target_column)
plt.legend()
plt.subplot(1, 2, 2)
plt.title('Inverse Box-Cox Transformed Test Target')
plt.plot(self.y_test, label='Inverse Transformed ' + self.target_column)
plt.legend()
plt.show()
def reshape_for_recurrent(self, data):
self.logger.info("Reshaping data for recurrent models...")
reshaped_data = data.reshape(data.shape)
self.logger.info(f"Data reshaped to {reshaped_data.shape}.")
return reshaped_data
def generate_sequences(self, X_data, y_data, n_steps, seq_to_seq=False):
X, y = [], []
for i in range(len(X_data) - n_steps):
seq_x = X_data[i:i + n_steps, :]
if seq_to_seq:
seq_y = y_data[i:i + n_steps, :]
else:
seq_y = y_data[i + n_steps - 1]
X.append(seq_x)
y.append(seq_y)
self.logger.info(f"Generated {len(X)} sequences of shape {X[0].shape}.")
return np.array(X), np.array(y)
def prepare_data_for_recurrent(self, n_steps, seq_to_seq=False):
self.logger.info(f"Preparing data for recurrent models with {n_steps} timesteps...")
X_train_seq, y_train_seq = self.generate_sequences(self.X_train, self.y_train, n_steps, seq_to_seq)
X_test_seq, y_test_seq = self.generate_sequences(self.X_test, self.y_test, n_steps, seq_to_seq)
# Update instance variables here
self.X_train_seq = self.reshape_for_recurrent(X_train_seq)
self.X_test_seq = self.reshape_for_recurrent(X_test_seq)
self.y_train_seq = y_train_seq # Assuming y_train_seq and y_test_seq are already 2D
self.y_test_seq = y_test_seq
self.logger.info("Data preparation for recurrent models completed.")
return self.X_train_seq, self.y_train_seq, self.X_test_seq, self.y_test_seq
def prepare_for_prophet(self):
"""Prepare data for Prophet model."""
prophet_data = self.data[[self.target_column]].reset_index()
prophet_data.columns = ['ds', 'y']
return prophet_data
def get_preprocessed_data(self):
"""Get preprocessed data."""
return self.X_train, self.y_train, self.X_test, self.y_test
df = data_c['BTC'].copy()
# LSTM Sequece-to-One
tsa = TimeSeriesAnalysis(df, target='Close')
d_preprocessor = UnifiedDataPreprocessor(df, target_column='Close')
d_preprocessor.split_and_plot_data(test_size=0.2)
d_preprocessor.normalize_data(scaler_type='MinMax',plot=False)
d_preprocessor.normalize_target(scaler_type='MinMax',plot=False)
n_steps = 10
X_train_seq, y_train_seq, X_test_seq, y_test_seq = d_preprocessor.prepare_data_for_recurrent(n_steps, seq_to_seq=False)
# LSTM Sequece-to-Sequence
d_preprocessor = UnifiedDataPreprocessor(df, target_column='Close')
d_preprocessor.split_and_plot_data(test_size=0.2, plot=False)
d_preprocessor.normalize_data(scaler_type='MinMax', plot=False)
d_preprocessor.normalize_target(scaler_type='MinMax', plot=False)
n_steps = 10
X_train_seq1, y_train_seq1, X_test_seq1, y_test_seq1 = d_preprocessor.prepare_data_for_recurrent(n_steps, seq_to_seq=True)
# For Linear Regression
d_preprocessor = UnifiedDataPreprocessor(df, target_column='Close')
d_preprocessor.split_and_plot_data(test_size=0.2, plot=False)
d_preprocessor.normalize_data(scaler_type='MinMax', plot=False)
d_preprocessor.normalize_target(scaler_type='MinMax', plot=False)
X_train_lr, y_train_lr, X_test_lr, y_test_lr = d_preprocessor.get_preprocessed_data()
# For XGBoost
d_preprocessor = UnifiedDataPreprocessor(df, target_column='Close')
d_preprocessor.split_and_plot_data(test_size=0.2, plot=False)
d_preprocessor.normalize_data(scaler_type='MinMax', plot=False)
d_preprocessor.normalize_target(scaler_type='MinMax', plot=False)
X_train_xgb, y_train_xgb, X_test_xgb, y_test_xgb = d_preprocessor.get_preprocessed_data()
2023-10-02 12:56:12,723 [INFO] - Initializing TimeSeriesAnalysis class 2023-10-02 12:56:12,726 [INFO] - Initializing DataPreprocessor... 2023-10-02 12:56:12,727 [INFO] - Splitting data... 2023-10-02 12:56:12,735 [INFO] - Data split completed. X_train shape: (3047, 5), y_train shape: (3047,)
X_train shape: (3047, 5), y_train shape: (3047,) X_test shape: (762, 5), y_test shape: (762,)
2023-10-02 12:56:13,133 [INFO] - Normalizing feature data... 2023-10-02 12:56:13,135 [INFO] - Getting scaler of type: MinMax 2023-10-02 12:56:13,137 [INFO] - Feature data normalization completed. 2023-10-02 12:56:13,143 [INFO] - Normalizing target data... 2023-10-02 12:56:13,143 [INFO] - Getting scaler of type: MinMax 2023-10-02 12:56:13,143 [INFO] - Target data normalization completed. 2023-10-02 12:56:13,143 [INFO] - Preparing data for recurrent models with 10 timesteps... 2023-10-02 12:56:13,150 [INFO] - Generated 3037 sequences of shape (10, 5). 2023-10-02 12:56:13,158 [INFO] - Generated 752 sequences of shape (10, 5). 2023-10-02 12:56:13,158 [INFO] - Reshaping data for recurrent models... 2023-10-02 12:56:13,158 [INFO] - Data reshaped to (3037, 10, 5). 2023-10-02 12:56:13,164 [INFO] - Reshaping data for recurrent models... 2023-10-02 12:56:13,167 [INFO] - Data reshaped to (752, 10, 5). 2023-10-02 12:56:13,168 [INFO] - Data preparation for recurrent models completed. 2023-10-02 12:56:13,169 [INFO] - Initializing DataPreprocessor... 2023-10-02 12:56:13,171 [INFO] - Splitting data... 2023-10-02 12:56:13,175 [INFO] - Data split completed. X_train shape: (3047, 5), y_train shape: (3047,) 2023-10-02 12:56:13,176 [INFO] - Normalizing feature data... 2023-10-02 12:56:13,178 [INFO] - Getting scaler of type: MinMax 2023-10-02 12:56:13,185 [INFO] - Feature data normalization completed. 2023-10-02 12:56:13,186 [INFO] - Normalizing target data... 2023-10-02 12:56:13,187 [INFO] - Getting scaler of type: MinMax 2023-10-02 12:56:13,190 [INFO] - Target data normalization completed. 2023-10-02 12:56:13,191 [INFO] - Preparing data for recurrent models with 10 timesteps... 2023-10-02 12:56:13,197 [INFO] - Generated 3037 sequences of shape (10, 5). 2023-10-02 12:56:13,200 [INFO] - Generated 752 sequences of shape (10, 5). 2023-10-02 12:56:13,207 [INFO] - Reshaping data for recurrent models... 2023-10-02 12:56:13,209 [INFO] - Data reshaped to (3037, 10, 5). 2023-10-02 12:56:13,210 [INFO] - Reshaping data for recurrent models... 2023-10-02 12:56:13,211 [INFO] - Data reshaped to (752, 10, 5). 2023-10-02 12:56:13,214 [INFO] - Data preparation for recurrent models completed. 2023-10-02 12:56:13,216 [INFO] - Initializing DataPreprocessor... 2023-10-02 12:56:13,216 [INFO] - Splitting data... 2023-10-02 12:56:13,219 [INFO] - Data split completed. X_train shape: (3047, 5), y_train shape: (3047,) 2023-10-02 12:56:13,219 [INFO] - Normalizing feature data... 2023-10-02 12:56:13,219 [INFO] - Getting scaler of type: MinMax 2023-10-02 12:56:13,219 [INFO] - Feature data normalization completed. 2023-10-02 12:56:13,219 [INFO] - Normalizing target data... 2023-10-02 12:56:13,219 [INFO] - Getting scaler of type: MinMax 2023-10-02 12:56:13,219 [INFO] - Target data normalization completed. 2023-10-02 12:56:13,234 [INFO] - Initializing DataPreprocessor... 2023-10-02 12:56:13,234 [INFO] - Splitting data... 2023-10-02 12:56:13,234 [INFO] - Data split completed. X_train shape: (3047, 5), y_train shape: (3047,) 2023-10-02 12:56:13,234 [INFO] - Normalizing feature data... 2023-10-02 12:56:13,234 [INFO] - Getting scaler of type: MinMax 2023-10-02 12:56:13,252 [INFO] - Feature data normalization completed. 2023-10-02 12:56:13,254 [INFO] - Normalizing target data... 2023-10-02 12:56:13,256 [INFO] - Getting scaler of type: MinMax 2023-10-02 12:56:13,258 [INFO] - Target data normalization completed.
X_train shape: (3047, 5), y_train shape: (3047,) X_test shape: (762, 5), y_test shape: (762,) X_train shape: (3047, 5), y_train shape: (3047,) X_test shape: (762, 5), y_test shape: (762,) X_train shape: (3047, 5), y_train shape: (3047,) X_test shape: (762, 5), y_test shape: (762,)
print("LSTM Sequence-to-One Data Shapes:")
print("X_train_seq:", X_train_seq.shape)
print("y_train_seq:", y_train_seq.shape)
print("X_test_seq:", X_test_seq.shape)
print("y_test_seq:", y_test_seq.shape)
print("----")
print("LSTM Sequence-to-Sequence Data Shapes:")
print("X_train_seq:", X_train_seq1.shape)
print("y_train_seq:", y_train_seq1.shape)
print("X_test_seq:", X_test_seq1.shape)
print("y_test_seq:", y_test_seq1.shape)
print("----")
print("LR Data Shapes:")
print("X_train_lr:", X_train_lr.shape)
print("y_train_lr:", y_train_lr.shape)
print("X_test_lr:", X_test_lr.shape)
print("y_test_lr:", y_test_lr.shape)
print("----")
print("XGB Data Shapes:")
print("X_train_xgb:", X_train_xgb.shape)
print("y_train_xgb:", y_train_xgb.shape)
print("X_test_xgb:", X_test_xgb.shape)
print("y_test_xgb:", y_test_xgb.shape)
print("----")
LSTM Sequence-to-One Data Shapes: X_train_seq: (3037, 10, 5) y_train_seq: (3037, 1) X_test_seq: (752, 10, 5) y_test_seq: (752, 1) ---- LSTM Sequence-to-Sequence Data Shapes: X_train_seq: (3037, 10, 5) y_train_seq: (3037, 10, 1) X_test_seq: (752, 10, 5) y_test_seq: (752, 10, 1) ---- LR Data Shapes: X_train_lr: (3047, 5) y_train_lr: (3047, 1) X_test_lr: (762, 5) y_test_lr: (762, 1) ---- XGB Data Shapes: X_train_xgb: (3047, 5) y_train_xgb: (3047, 1) X_test_xgb: (762, 5) y_test_xgb: (762, 1) ----
class BaseModel:
"""
A base class for machine learning models.
This class handles data preprocessing, model training, predictions, and evaluations.
- Linear Regression
- XGBoost
- LightGBM
- KNN
- SVM
- Random Forest
"""
def __init__(self, data_preprocessor, config, plot=True):
self._validate_input(data_preprocessor.X_train, data_preprocessor.y_train, data_preprocessor.X_test, data_preprocessor.y_test)
self.X_train = data_preprocessor.X_train
self.y_train = data_preprocessor.y_train
self.X_test = data_preprocessor.X_test
self.y_test = data_preprocessor.y_test
self.feature_scaler = data_preprocessor.scalers['features']
self.target_scaler = data_preprocessor.scalers['target']
self.data = data_preprocessor.data
self.config = config
self.plot = plot
self.logger = logging.getLogger(__name__)
def _validate_input(self, X_train, y_train, X_test, y_test):
"""Validate the shape and type of training and testing data."""
for arr, name in [(X_train, 'X_train'), (y_train, 'y_train'), (X_test, 'X_test'), (y_test, 'y_test')]:
if not isinstance(arr, np.ndarray) or len(arr.shape) != 2:
raise ValueError(f"{name} should be a 2D numpy array.")
def inverse_scale_predictions(self):
""" Inverse and unscale the predicstion back to their original shape"""
try:
self.train_predictions = self.target_scaler.inverse_transform(self.train_predictions.reshape(-1, 1)).flatten()
self.test_predictions = self.target_scaler.inverse_transform(self.test_predictions.reshape(-1, 1)).flatten()
self.logger.info("Predictions inverse transformed to original scale")
except Exception as e:
self.logger.error(f"Error occurred while inverse transforming predictions: {str(e)}")
def compare_predictions(self):
"""Create dataframes comparing the original and predicted values for both training and test sets."""
try:
train_indices = self.data['Close'].iloc[:len(self.y_train)].values
test_indices = self.data['Close'].iloc[-len(self.y_test):].values
train_comparison_df = pd.DataFrame({'Original': train_indices, 'Predicted': self.train_predictions.ravel()})
test_comparison_df = pd.DataFrame({'Original': test_indices, 'Predicted': self.test_predictions.ravel()})
train_date_index = self.data.index[:len(self.y_train)]
test_date_index = self.data.index[-len(self.y_test):]
train_comparison_df.set_index(train_date_index, inplace=True)
test_comparison_df.set_index(test_date_index, inplace=True)
self.logger.info("Comparison dataframes generated")
return train_comparison_df, test_comparison_df
except Exception as e:
self.logger.error(f"Error occurred while creating comparison dataframes: {str(e)}")
def evaluate_model(self):
"""Evaluate the model using various metrics for both training and test sets."""
try:
train_comparison_df, test_comparison_df = self.compare_predictions()
metrics = {
'RMSE': lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred)),
'R2 Score': r2_score,
'MAE': mean_absolute_error,
'Explained Variance': explained_variance_score
}
results = []
for dataset, comparison_df in [('Train', train_comparison_df), ('Test', test_comparison_df)]:
dataset_results = {metric_name: metric_func(comparison_df['Original'], comparison_df['Predicted']) for metric_name, metric_func in metrics.items()}
results.append(dataset_results)
results_df = pd.DataFrame(results, index=['Train', 'Test'])
return results_df
except Exception as e:
self.logger.error(f"Error occurred while evaluating the model: {str(e)}")
@staticmethod
def update_config_hash_mapping(config_hash, config, folder_name="models_assets"):
mapping_file_path = os.path.join(folder_name, 'config_hash_mapping.json')
if os.path.exists(mapping_file_path):
with open(mapping_file_path, 'r') as f:
existing_mappings = json.load(f)
else:
existing_mappings = {}
existing_mappings[config_hash] = config
# Save updated mappings
with open(mapping_file_path, 'w') as f:
json.dump(existing_mappings, f, indent=4)
def save_model_to_folder(self, version, folder_name="models_assets"):
model_name = self.__class__.__name__[9:] # Remove 'Enhanced_' from the class name
config_str = json.dumps(self.config, sort_keys=True)
config_hash = hashlib.md5(config_str.encode()).hexdigest()[:6]
if not os.path.exists(folder_name):
os.makedirs(folder_name)
BaseModel.update_config_hash_mapping(config_hash, self.config, folder_name)
# Save the model
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
filename = f"{model_name}_V{version}_{config_hash}_{timestamp}.joblib"
full_path = os.path.join(folder_name, filename)
dump(self.model, full_path)
self.logger.info(f"Model saved to {full_path}")
def plot_predictions(self):
"""Plot the original vs predicted values for both training and testing data."""
if not self.plot:
return
train_comparison_df, test_comparison_df = self.compare_predictions()
train_comparison_df.index = pd.to_datetime(train_comparison_df.index)
test_comparison_df.index = pd.to_datetime(test_comparison_df.index)
source_train = ColumnDataSource(data=dict(
date=train_comparison_df.index,
original=train_comparison_df['Original'],
predicted=train_comparison_df['Predicted']
))
source_test = ColumnDataSource(data=dict(
date=test_comparison_df.index,
original=test_comparison_df['Original'],
predicted=test_comparison_df['Predicted']
))
p1 = figure(width=700, height=600, x_axis_type="datetime", title="Training Data: Actual vs Predicted")
p1.line('date', 'original', legend_label="Actual", line_alpha=0.6, source=source_train)
p1.line('date', 'predicted', legend_label="Predicted", line_color="red", line_dash="dashed", source=source_train)
p1.legend.location = "top_left"
p2 = figure(width=700, height=600, x_axis_type="datetime", title="Testing Data: Actual vs Predicted")
p2.line('date', 'original', legend_label="Actual", line_alpha=0.6, source=source_test)
p2.line('date', 'predicted', legend_label="Predicted", line_color="red", line_dash="dashed", source=source_test)
p2.legend.location = "top_left"
hover1 = HoverTool()
hover1.tooltips = [
("Date", "@date{%F}"),
("Actual Value", "@original{0,0.0000}"),
("Predicted Value", "@predicted{0,0.0000}")
]
hover1.formatters = {"@date": "datetime"}
p1.add_tools(hover1)
hover2 = HoverTool()
hover2.tooltips = [
("Date", "@date{%F}"),
("Actual Value", "@original{0,0.0000}"),
("Predicted Value", "@predicted{0,0.0000}")
]
hover2.formatters = {"@date": "datetime"}
p2.add_tools(hover2)
# Show plots
show(row(p1, p2))
class Enhanced_Linear_Regression(BaseModel):
"""
Initialize the Enhanced_Linear_Regression model.
Supports Ridge and Lasso regularization.
"""
def __init__(self, data_preprocessor, config, plot=True):
super().__init__(data_preprocessor, config, plot)
self._initialize_model()
def _initialize_model(self):
"""Choose the regression model based on the configuration."""
if self.config['regularization'] == 'ridge':
self.model = Ridge(alpha=self.config['alpha'])
self.logger.info("Ridge regression model initialized.")
elif self.config['regularization'] == 'lasso':
self.model = Lasso(alpha=self.config['alpha'])
self.logger.info("Lasso regression model initialized.")
else:
self.model = LinearRegression()
self.logger.info("Plain Linear Regression model initialized.")
def train_model(self):
"""Train the Linear Regression model."""
try:
self.model.fit(self.X_train, self.y_train)
self.logger.info("Linear Regression model trained successfully.")
except Exception as e:
self.logger.error(f"Error occurred while training the model: {str(e)}")
def make_predictions(self):
"""Make predictions using the trained model for training and test sets."""
try:
self.train_predictions = self.model.predict(self.X_train)
self.test_predictions = self.model.predict(self.X_test)
self.logger.info("Predictions made successfully.")
except Exception as e:
self.logger.error(f"Error occurred while making predictions: {str(e)}")
class Enhanced_XGBoost(BaseModel):
def __init__(self, data_preprocessor, config, plot=True):
super().__init__(data_preprocessor, config, plot)
self._initialize_model()
def _initialize_model(self):
"""Initialize the XGBoost model based on the configuration."""
self.model = xgb.XGBRegressor(**self.config)
self.logger.info("XGBoost model initialized.")
def train_model(self):
"""Train the XGBoost model."""
try:
self.model.fit(self.X_train, self.y_train)
self.logger.info("XGBoost model trained successfully")
except Exception as e:
self.logger.error(f"Error occurred while training the model: {str(e)}")
def make_predictions(self):
"""Make predictions using the trained model for training and test sets."""
try:
self.train_predictions = self.model.predict(self.X_train)
self.test_predictions = self.model.predict(self.X_test)
self.logger.info("Predictions made successfully for both training and test data")
except Exception as e:
self.logger.error(f"Error occurred while making predictions: {str(e)}")
class Enhanced_LightGBM(BaseModel):
"""
Initialize the Enhanced LightGBM model.
"""
def __init__(self, data_preprocessor, config, plot=True):
super().__init__(data_preprocessor, config, plot)
self._initialize_model()
def _initialize_model(self):
"""Initialize the LightGBM model based on the configuration."""
self.model = LGBMRegressor(**self.config)
self.logger.info("LightGBM model initialized.")
def train_model(self):
try:
self.model.fit(self.X_train, self.y_train)
self.logger.info("LightGBM model trained successfully")
except Exception as e:
self.logger.error(f"Error occurred while training the model: {str(e)}")
def make_predictions(self):
"""Make predictions using the trained model for training and test sets."""
try:
self.train_predictions = self.model.predict(self.X_train)
self.test_predictions = self.model.predict(self.X_test)
self.logger.info("Predictions made successfully for both training and test data")
except Exception as e:
self.logger.error(f"Error occurred while making predictions: {str(e)}")
class Enhanced_SVM(BaseModel):
"""
Initialize the Enhanced SVM model.
"""
def __init__(self, data_preprocessor, config, plot=True):
super().__init__(data_preprocessor, config, plot)
self._initialize_model()
def _initialize_model(self):
"""Initialize the SVM model based on the configuration."""
self.model = SVR(**self.config)
self.logger.info("SVM model initialized.")
def train_model(self):
"""Train the SVM model."""
try:
self.model.fit(self.X_train, self.y_train.ravel()) # ravel() to convert y_train to 1D for SVM
self.logger.info("SVM model trained successfully.")
except Exception as e:
self.logger.error(f"Error occurred while training the model: {str(e)}")
def make_predictions(self):
"""Make predictions using the trained model."""
try:
self.train_predictions = self.model.predict(self.X_train)
self.test_predictions = self.model.predict(self.X_test)
self.logger.info("Predictions made successfully.")
except Exception as e:
self.logger.error(f"Error occurred while making predictions: {str(e)}")
class Enhanced_KNN(BaseModel):
"""
Initialize the Enhanced KNN model.
"""
def __init__(self, data_preprocessor, config, plot=True):
super().__init__(data_preprocessor, config, plot)
self._initialize_model()
def _initialize_model(self):
"""
Initialize the KNN model based on the configuration.
"""
self.model = KNeighborsRegressor(**self.config)
self.logger.info("KNN model initialized.")
def train_model(self):
"""
Train the KNN model.
"""
try:
self.model.fit(self.X_train, self.y_train.ravel()) # ravel() to convert y_train to 1D for KNN
self.logger.info("KNN model trained successfully.")
except Exception as e:
self.logger.error(f"Error occurred while training the model: {str(e)}")
def make_predictions(self):
"""Make predictions using the trained model."""
try:
self.train_predictions = self.model.predict(self.X_train)
self.test_predictions = self.model.predict(self.X_test)
self.logger.info("Predictions made successfully.")
except Exception as e:
self.logger.error(f"Error occurred while making predictions: {str(e)}")
class Enhanced_RandomForest(BaseModel):
"""
A class for an enhanced Random Forest Regression model.
Inherits from the BaseModel class.
"""
def __init__(self, data_preprocessor, config, plot=True):
super().__init__(data_preprocessor, config, plot)
self._initialize_model()
def _initialize_model(self):
"""
Initialize the Random Forest model based on the configuration.
"""
self.model = RandomForestRegressor(**self.config)
self.logger.info("Random Forest model initialized.")
def feature_importance(self):
"""
Extract feature importance scores.
"""
try:
importance_scores = self.model.feature_importances_
self.logger.info("Feature importance scores extracted.")
return importance_scores
except Exception as e:
self.logger.error(f"Error occurred while extracting feature importance: {str(e)}")
def train_model(self):
"""Make predictions using the trained model for training and test sets."""
try:
self.model.fit(self.X_train, self.y_train.ravel()) # Using ravel() to fit the expected shape
self.logger.info("RandomForest model trained successfully")
except Exception as e:
self.logger.error(f"Error occurred while training the model: {str(e)}")
def make_predictions(self):
"""Make predictions using the trained model."""
try:
self.train_predictions = self.model.predict(self.X_train)
self.test_predictions = self.model.predict(self.X_test)
self.logger.info("Predictions made successfully.")
except Exception as e:
self.logger.error(f"Error occurred while making predictions: {str(e)}")
class Enhanced_SVR(BaseModel):
"""
Initialize the Enhanced SVR model.
"""
def __init__(self, data_preprocessor, config, plot=True):
super().__init__(data_preprocessor, config, plot)
self._initialize_model()
def _initialize_model(self):
"""Initialize the SVR model based on the configuration."""
self.model = SVR(**self.config)
self.logger.info("SVR model initialized.")
def train_model(self):
"""Train the model."""
try:
self.model.fit(self.X_train, self.y_train.ravel()) # Using ravel() to fit the expected shape for some models
self.logger.info(f"{self.__class__.__name__} model trained successfully.")
except Exception as e:
self.logger.error(f"Error occurred while training the model: {str(e)}")
def make_predictions(self):
"""Make predictions using the trained model."""
try:
self.train_predictions = self.model.predict(self.X_train)
self.test_predictions = self.model.predict(self.X_test)
self.logger.info("Predictions made successfully.")
except Exception as e:
self.logger.error(f"Error occurred while making predictions: {str(e)}")
class Enhanced_ExtraTrees(BaseModel):
"""
Initialize the Enhanced Extra Trees model.
"""
def __init__(self, data_preprocessor, config, plot=True):
super().__init__(data_preprocessor, config, plot)
self._initialize_model()
def _initialize_model(self):
"""Initialize the Extra Trees model based on the configuration."""
self.model = ExtraTreesRegressor(**self.config)
self.logger.info("Extra Trees model initialized.")
def train_model(self):
"""Train the model."""
try:
self.model.fit(self.X_train, self.y_train.ravel()) # Using ravel() to fit the expected shape for some models
self.logger.info(f"{self.__class__.__name__} model trained successfully.")
except Exception as e:
self.logger.error(f"Error occurred while training the model: {str(e)}")
def make_predictions(self):
"""Make predictions using the trained model."""
try:
self.train_predictions = self.model.predict(self.X_train)
self.test_predictions = self.model.predict(self.X_test)
self.logger.info("Predictions made successfully.")
except Exception as e:
self.logger.error(f"Error occurred while making predictions: {str(e)}")
data_preprocessor = UnifiedDataPreprocessor(df, target_column='Close')
data_preprocessor.split_and_plot_data(test_size=0.2, plot=False)
data_preprocessor.normalize_data(scaler_type='MinMax', plot=False)
data_preprocessor.normalize_target(scaler_type='MinMax', plot=False)
config = {
'regularization': 'ridge', # 'ridge', 'lasso', or None for plain Linear Regression
'alpha': 1.0 # regularization strength
}
model = Enhanced_Linear_Regression(data_preprocessor, config, plot=True)
model.train_model()
model.make_predictions()
model.inverse_scale_predictions()
train_comparison_df, test_comparison_df = model.compare_predictions()
#display(test_comparison_df)
#display(train_comparison_df)
evaluation_results = model.evaluate_model()
display(evaluation_results)
model.plot_predictions() # This will obey the plotting=True/False flag set during initialization
model.save_model_to_folder(version="final")
2023-10-02 12:56:13,507 [INFO] - Initializing DataPreprocessor... 2023-10-02 12:56:13,509 [INFO] - Splitting data... 2023-10-02 12:56:13,510 [INFO] - Data split completed. X_train shape: (3047, 5), y_train shape: (3047,) 2023-10-02 12:56:13,510 [INFO] - Normalizing feature data... 2023-10-02 12:56:13,510 [INFO] - Getting scaler of type: MinMax 2023-10-02 12:56:13,510 [INFO] - Feature data normalization completed. 2023-10-02 12:56:13,510 [INFO] - Normalizing target data... 2023-10-02 12:56:13,526 [INFO] - Getting scaler of type: MinMax 2023-10-02 12:56:13,530 [INFO] - Target data normalization completed. 2023-10-02 12:56:13,532 [INFO] - Ridge regression model initialized. 2023-10-02 12:56:13,536 [INFO] - Linear Regression model trained successfully. 2023-10-02 12:56:13,539 [INFO] - Predictions made successfully. 2023-10-02 12:56:13,542 [INFO] - Predictions inverse transformed to original scale 2023-10-02 12:56:13,544 [INFO] - Comparison dataframes generated 2023-10-02 12:56:13,549 [INFO] - Comparison dataframes generated
X_train shape: (3047, 5), y_train shape: (3047,) X_test shape: (762, 5), y_test shape: (762,)
| RMSE | R2 Score | MAE | Explained Variance | |
|---|---|---|---|---|
| Train | 286.024 | 0.999 | 134.240 | 0.999 |
| Test | 555.414 | 0.998 | 423.149 | 0.998 |
2023-10-02 12:56:13,563 [INFO] - Comparison dataframes generated
2023-10-02 12:56:14,048 [INFO] - Model saved to models_assets\Linear_Regression_Vfinal_909612_20231002_125614.joblib
xgb_config = {
'objective': 'reg:squarederror',
'learning_rate': 0.1,
'n_estimators': 100,
'max_depth': 5
}
xgb_model = Enhanced_XGBoost(data_preprocessor, xgb_config, plot=True)
xgb_model.train_model()
xgb_model.make_predictions()
xgb_model.inverse_scale_predictions() # Make sure this method reshapes the arrays
train_comparison_df, test_comparison_df = xgb_model.compare_predictions()
#display(test_comparison_df)
#display(train_comparison_df)
evaluation_results = xgb_model.evaluate_model()
display(evaluation_results)
xgb_model.plot_predictions()
xgb_model.save_model_to_folder(version="final")
2023-10-02 12:56:14,061 [INFO] - XGBoost model initialized. 2023-10-02 12:56:14,415 [INFO] - XGBoost model trained successfully 2023-10-02 12:56:14,426 [INFO] - Predictions made successfully for both training and test data 2023-10-02 12:56:14,428 [INFO] - Predictions inverse transformed to original scale 2023-10-02 12:56:14,428 [INFO] - Comparison dataframes generated 2023-10-02 12:56:14,428 [INFO] - Comparison dataframes generated
| RMSE | R2 Score | MAE | Explained Variance | |
|---|---|---|---|---|
| Train | 54.681 | 1.000 | 32.653 | 1.000 |
| Test | 832.921 | 0.995 | 620.550 | 0.997 |
2023-10-02 12:56:14,465 [INFO] - Comparison dataframes generated
2023-10-02 12:56:15,202 [INFO] - Model saved to models_assets\XGBoost_Vfinal_93ce58_20231002_125615.joblib
lgbm_config = {
'objective': 'regression',
'learning_rate': 0.1,
'n_estimators': 100,
'max_depth': 5
}
lgbm_model = Enhanced_LightGBM(data_preprocessor, lgbm_config, plot=True)
lgbm_model.train_model()
lgbm_model.make_predictions()
lgbm_model.inverse_scale_predictions()
train_comparison_df, test_comparison_df = lgbm_model.compare_predictions()
#display(test_comparison_df)
#display(train_comparison_df)
evaluation_results = lgbm_model.evaluate_model()
display(evaluation_results)
lgbm_model.plot_predictions()
lgbm_model.save_model_to_folder(version="final")
2023-10-02 12:56:15,218 [INFO] - LightGBM model initialized. 2023-10-02 12:56:15,341 [INFO] - LightGBM model trained successfully 2023-10-02 12:56:15,344 [INFO] - Predictions made successfully for both training and test data 2023-10-02 12:56:15,344 [INFO] - Predictions inverse transformed to original scale 2023-10-02 12:56:15,344 [INFO] - Comparison dataframes generated 2023-10-02 12:56:15,359 [INFO] - Comparison dataframes generated
| RMSE | R2 Score | MAE | Explained Variance | |
|---|---|---|---|---|
| Train | 190.533 | 1.000 | 73.944 | 1.000 |
| Test | 1314.767 | 0.989 | 956.353 | 0.990 |
2023-10-02 12:56:15,387 [INFO] - Comparison dataframes generated
2023-10-02 12:56:15,960 [INFO] - Model saved to models_assets\LightGBM_Vfinal_847f0b_20231002_125615.joblib
svm_config = {
'kernel': 'rbf',
'C': 1.0,
'epsilon': 0.1
}
# Initialize Enhanced_SVM model
svm_model = Enhanced_SVM(data_preprocessor, svm_config, plot=True)
svm_model.train_model()
svm_model.make_predictions()
svm_model.inverse_scale_predictions()
train_comparison_df, test_comparison_df = svm_model.compare_predictions()
#display(test_comparison_df)
#display(train_comparison_df)
evaluation_results = svm_model.evaluate_model()
display(evaluation_results)
svm_model.plot_predictions()
svm_model.save_model_to_folder(version="final")
2023-10-02 12:56:15,968 [INFO] - SVM model initialized. 2023-10-02 12:56:15,984 [INFO] - SVM model trained successfully. 2023-10-02 12:56:16,008 [INFO] - Predictions made successfully. 2023-10-02 12:56:16,011 [INFO] - Predictions inverse transformed to original scale 2023-10-02 12:56:16,011 [INFO] - Comparison dataframes generated 2023-10-02 12:56:16,011 [INFO] - Comparison dataframes generated
| RMSE | R2 Score | MAE | Explained Variance | |
|---|---|---|---|---|
| Train | 4909.593 | 0.835 | 4719.071 | 0.965 |
| Test | 4640.810 | 0.860 | 4342.444 | 0.881 |
2023-10-02 12:56:16,052 [INFO] - Comparison dataframes generated
2023-10-02 12:56:16,776 [INFO] - Model saved to models_assets\SVM_Vfinal_6de0eb_20231002_125616.joblib
svr_config = {
'kernel': 'rbf',
'C': 1.0,
'epsilon': 0.1
}
svr_model = Enhanced_SVR(data_preprocessor, svr_config, plot=True)
svr_model.train_model()
svr_model.make_predictions()
svr_model.inverse_scale_predictions()
train_comparison_df, test_comparison_df = svr_model.compare_predictions()
#display(test_comparison_df)
#display(train_comparison_df)
evaluation_results = svr_model.evaluate_model()
display(evaluation_results)
svr_model.plot_predictions()
svr_model.save_model_to_folder(version="final")
2023-10-02 12:56:16,797 [INFO] - SVR model initialized. 2023-10-02 12:56:16,802 [INFO] - Enhanced_SVR model trained successfully. 2023-10-02 12:56:16,802 [INFO] - Predictions made successfully. 2023-10-02 12:56:16,802 [INFO] - Predictions inverse transformed to original scale 2023-10-02 12:56:16,802 [INFO] - Comparison dataframes generated 2023-10-02 12:56:16,817 [INFO] - Comparison dataframes generated
| RMSE | R2 Score | MAE | Explained Variance | |
|---|---|---|---|---|
| Train | 4909.593 | 0.835 | 4719.071 | 0.965 |
| Test | 4640.810 | 0.860 | 4342.444 | 0.881 |
2023-10-02 12:56:16,833 [INFO] - Comparison dataframes generated
2023-10-02 12:56:17,427 [INFO] - Model saved to models_assets\SVR_Vfinal_6de0eb_20231002_125617.joblib
knn_config = {
'n_neighbors': 5,
'weights': 'uniform',
'algorithm': 'auto'
}
# Initialize Enhanced_KNN model
knn_model = Enhanced_KNN(data_preprocessor, knn_config, plot=True)
knn_model.train_model()
knn_model.make_predictions()
knn_model.inverse_scale_predictions()
train_comparison_df, test_comparison_df = knn_model.compare_predictions()
#display(test_comparison_df)
#display(train_comparison_df)
evaluation_results = knn_model.evaluate_model()
display(evaluation_results)
knn_model.plot_predictions()
knn_model.save_model_to_folder(version="final")
2023-10-02 12:56:17,449 [INFO] - KNN model initialized. 2023-10-02 12:56:17,456 [INFO] - KNN model trained successfully. 2023-10-02 12:56:17,468 [INFO] - Predictions made successfully. 2023-10-02 12:56:17,468 [INFO] - Predictions inverse transformed to original scale 2023-10-02 12:56:17,468 [INFO] - Comparison dataframes generated 2023-10-02 12:56:17,468 [INFO] - Comparison dataframes generated
| RMSE | R2 Score | MAE | Explained Variance | |
|---|---|---|---|---|
| Train | 216.949 | 1.000 | 81.503 | 1.000 |
| Test | 1059.873 | 0.993 | 823.640 | 0.993 |
2023-10-02 12:56:17,499 [INFO] - Comparison dataframes generated
2023-10-02 12:56:18,347 [INFO] - Model saved to models_assets\KNN_Vfinal_f67d60_20231002_125618.joblib
rf_config = {
'n_estimators': 100,
'criterion': 'poisson',
'max_depth': None
}
# Initialize Enhanced_RandomForest model
rf_model = Enhanced_RandomForest(data_preprocessor, rf_config, plot=True)
rf_model.train_model()
feature_importance_scores = rf_model.feature_importance()
rf_model.make_predictions()
rf_model.inverse_scale_predictions()
train_comparison_df, test_comparison_df = rf_model.compare_predictions()
#display(test_comparison_df)
#display(train_comparison_df)
evaluation_results = rf_model.evaluate_model()
display(evaluation_results)
rf_model.plot_predictions()
rf_model.save_model_to_folder(version="final")
2023-10-02 12:56:18,376 [INFO] - Random Forest model initialized. 2023-10-02 12:56:20,025 [INFO] - RandomForest model trained successfully 2023-10-02 12:56:20,049 [INFO] - Feature importance scores extracted. 2023-10-02 12:56:20,109 [INFO] - Predictions made successfully. 2023-10-02 12:56:20,109 [INFO] - Predictions inverse transformed to original scale 2023-10-02 12:56:20,109 [INFO] - Comparison dataframes generated 2023-10-02 12:56:20,122 [INFO] - Comparison dataframes generated
| RMSE | R2 Score | MAE | Explained Variance | |
|---|---|---|---|---|
| Train | 51.065 | 1.000 | 20.630 | 1.000 |
| Test | 680.265 | 0.997 | 542.648 | 0.998 |
2023-10-02 12:56:20,139 [INFO] - Comparison dataframes generated
2023-10-02 12:56:20,840 [INFO] - Model saved to models_assets\RandomForest_Vfinal_cd5fc6_20231002_125620.joblib
extra_trees_config = {
'n_estimators': 100,
'criterion': 'mse',
'max_depth': None
}
extra_trees_model = Enhanced_ExtraTrees(data_preprocessor, extra_trees_config, plot=True)
extra_trees_model.train_model()
extra_trees_model.make_predictions()
extra_trees_model.inverse_scale_predictions()
train_comparison_df, test_comparison_df = extra_trees_model.compare_predictions()
#display(test_comparison_df)
#display(train_comparison_df)
evaluation_results = extra_trees_model.evaluate_model()
display(evaluation_results)
extra_trees_model.plot_predictions()
extra_trees_model.save_model_to_folder(version="final")
2023-10-02 12:56:20,843 [INFO] - Extra Trees model initialized. 2023-10-02 12:56:21,754 [INFO] - Enhanced_ExtraTrees model trained successfully. 2023-10-02 12:56:21,817 [INFO] - Predictions made successfully. 2023-10-02 12:56:21,817 [INFO] - Predictions inverse transformed to original scale 2023-10-02 12:56:21,817 [INFO] - Comparison dataframes generated 2023-10-02 12:56:21,832 [INFO] - Comparison dataframes generated
| RMSE | R2 Score | MAE | Explained Variance | |
|---|---|---|---|---|
| Train | 0.000 | 1.000 | 0.000 | 1.000 |
| Test | 674.384 | 0.997 | 568.834 | 0.999 |
2023-10-02 12:56:21,843 [INFO] - Comparison dataframes generated
2023-10-02 12:56:22,551 [INFO] - Model saved to models_assets\ExtraTrees_Vfinal_5f3128_20231002_125622.joblib
# LSTM Sequece-to-One
tsa = TimeSeriesAnalysis(df, target='Close')
data_preprocessor = UnifiedDataPreprocessor(df, target_column='Close')
data_preprocessor.split_and_plot_data(test_size=0.2, plot=False)
data_preprocessor.normalize_data(scaler_type='MinMax',plot=False)
data_preprocessor.normalize_target(scaler_type='MinMax',plot=False)
n_steps = 10
X_train_seq, y_train_seq, X_test_seq, y_test_seq = data_preprocessor.prepare_data_for_recurrent(n_steps, seq_to_seq=False)
print((data_preprocessor.X_train_seq).shape)
print((data_preprocessor.y_train_seq).shape)
print((data_preprocessor.X_test_seq).shape)
print((data_preprocessor.y_test_seq).shape)
print(hasattr(data_preprocessor, 'X_train_seq'))
2023-10-02 12:56:22,567 [INFO] - Initializing TimeSeriesAnalysis class 2023-10-02 12:56:22,583 [INFO] - Initializing DataPreprocessor... 2023-10-02 12:56:22,586 [INFO] - Splitting data... 2023-10-02 12:56:22,593 [INFO] - Data split completed. X_train shape: (3047, 5), y_train shape: (3047,) 2023-10-02 12:56:22,593 [INFO] - Normalizing feature data... 2023-10-02 12:56:22,593 [INFO] - Getting scaler of type: MinMax 2023-10-02 12:56:22,608 [INFO] - Feature data normalization completed. 2023-10-02 12:56:22,608 [INFO] - Normalizing target data... 2023-10-02 12:56:22,608 [INFO] - Getting scaler of type: MinMax 2023-10-02 12:56:22,608 [INFO] - Target data normalization completed. 2023-10-02 12:56:22,608 [INFO] - Preparing data for recurrent models with 10 timesteps... 2023-10-02 12:56:22,624 [INFO] - Generated 3037 sequences of shape (10, 5). 2023-10-02 12:56:22,634 [INFO] - Generated 752 sequences of shape (10, 5). 2023-10-02 12:56:22,634 [INFO] - Reshaping data for recurrent models... 2023-10-02 12:56:22,634 [INFO] - Data reshaped to (3037, 10, 5). 2023-10-02 12:56:22,634 [INFO] - Reshaping data for recurrent models... 2023-10-02 12:56:22,634 [INFO] - Data reshaped to (752, 10, 5). 2023-10-02 12:56:22,634 [INFO] - Data preparation for recurrent models completed.
X_train shape: (3047, 5), y_train shape: (3047,) X_test shape: (762, 5), y_test shape: (762,) (3037, 10, 5) (3037, 1) (752, 10, 5) (752, 1) True
class BaseModelLSTM():
"""
A base class for LSTM-like machine learning models.
This class handles data preprocessing, model training, predictions, and evaluations.
"""
def __init__(self, model_type, data_preprocessor, config, cross_val=False):
self._validate_input_sequence(data_preprocessor.X_train_seq, data_preprocessor.y_train_seq, data_preprocessor.X_test_seq, data_preprocessor.y_test_seq)
self.X_train = data_preprocessor.X_train_seq
self.y_train = data_preprocessor.y_train_seq
self.X_test = data_preprocessor.X_test_seq
self.y_test = data_preprocessor.y_test_seq
self.feature_scaler = data_preprocessor.scalers['features']
self.target_scaler = data_preprocessor.scalers['target']
self.data = data_preprocessor.data
self.config = config
self.cross_val = cross_val
self.model_type = model_type
self.params = {'model_type': model_type}
self.params.update(config)
self._initialize_model()
self.logger = logging.getLogger(__name__)
def _initialize_model(self):
logging.info(f"Initializing {self.model_type} model")
self.model = Sequential()
if self.model_type in ['LSTM', 'GRU']:
for i, unit in enumerate(self.config['units']):
return_sequences = True if i < len(self.config['units']) - 1 else False
layer = LSTM(units=unit, return_sequences=return_sequences) if self.model_type == 'LSTM' else GRU(units=unit, return_sequences=return_sequences)
self.model.add(layer)
self.model.add(Dropout(self.config['dropout']))
elif self.model_type == 'CNN-LSTM':
self.model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=self.config['input_shape']))
self.model.add(Dropout(self.config['dropout']))
self.model.add(LSTM(units=self.config['units'][0]))
self.model.add(Dense(units=self.config['dense_units']))
self.model.compile(optimizer=self.config['optimizer'], loss='mean_squared_error')
self.model.summary()
def _validate_input_sequence(self, X_train, y_train, X_test, y_test):
"""Validate the shape and type of training and testing sequence data."""
for arr, name in [(X_train, 'X_train_seq'), (y_train, 'y_train_seq'), (X_test, 'X_test_seq'), (y_test, 'y_test_seq')]:
if not isinstance(arr, np.ndarray):
raise ValueError(f"{name} should be a numpy array.")
if len(arr.shape) < 2:
raise ValueError(f"{name} should have at least two dimensions.")
# Special check for X_* arrays, which should be 3D for sequence models
if 'X_' in name and len(arr.shape) != 3:
raise ValueError(f"{name} should be a 3D numpy array for sequence models. Found shape {arr.shape}.")
def train_model(self, epochs=100, batch_size=50, early_stopping=True):
logging.info(f"Training {self.params['model_type']} model")
callbacks = [EarlyStopping(monitor='val_loss', patience=10)] if early_stopping else None
if self.cross_val:
tscv = TimeSeriesSplit(n_splits=5)
self.history = []
fold_no = 1
for train, val in tscv.split(self.X_train):
logging.info(f"Training on fold {fold_no}")
history = self.model.fit(self.X_train[train], self.y_train[train], epochs=epochs,
batch_size=batch_size, validation_data=(self.X_train[val], self.y_train[val]),
callbacks=callbacks, shuffle=False)
self.history.append(history)
logging.info(f"Done with fold {fold_no}")
self.model.summary()
fold_no += 1
else:
self.history = self.model.fit(self.X_train, self.y_train, epochs=epochs,
batch_size=batch_size, validation_split=0.2,
callbacks=callbacks, shuffle=False)
logging.info("Training completed")
self.model.summary()
def make_predictions(self):
logging.info("Making predictions")
self._make_raw_predictions()
self._make_unscaled_predictions()
self._create_comparison_dfs()
logging.info("Predictions made")
def _make_raw_predictions(self):
self.train_predictions = self.model.predict(self.X_train)
self.test_predictions = self.model.predict(self.X_test)
logging.info(f"Raw predictions made with shapes train: {self.train_predictions.shape}, test: {self.test_predictions.shape}")
def _make_unscaled_predictions(self):
# Check if the shape of the predictions matches that of y_train and y_test
if self.train_predictions.shape[:-1] != self.y_train.shape[:-1]:
logging.error(f"Shape mismatch: train_predictions {self.train_predictions.shape} vs y_train {self.y_train.shape}")
return
if self.test_predictions.shape[:-1] != self.y_test.shape[:-1]:
logging.error(f"Shape mismatch: test_predictions {self.test_predictions.shape} vs y_test {self.y_test.shape}")
return
# If predictions are 3D, reduce dimensionality by taking mean along last axis
if self.train_predictions.ndim == 3:
self.train_predictions = np.mean(self.train_predictions, axis=-1)
if self.test_predictions.ndim == 3:
self.test_predictions = np.mean(self.test_predictions, axis=-1)
# Perform the inverse transformation to get unscaled values
self.train_predictions = self.target_scaler.inverse_transform(self.train_predictions).flatten()
self.test_predictions = self.target_scaler.inverse_transform(self.test_predictions).flatten()
logging.info(f"Unscaled predictions made with shapes train: {self.train_predictions.shape}, test: {self.test_predictions.shape}")
def _create_comparison_dfs(self):
y_train_flat = self.target_scaler.inverse_transform(self.y_train).flatten()
y_test_flat = self.target_scaler.inverse_transform(self.y_test).flatten()
# Obtain date indices from original data
train_date_index = self.data.index[:len(self.y_train)]
test_date_index = self.data.index[-len(self.y_test):]
if y_train_flat.shape != self.train_predictions.shape:
logging.error(f"Shape mismatch between y_train {y_train_flat.shape} and train_predictions {self.train_predictions.shape}")
else:
self.train_comparison_df = pd.DataFrame({'Actual': y_train_flat, 'Predicted': self.train_predictions})
# Set date index for train_comparison_df
self.train_comparison_df.set_index(train_date_index, inplace=True)
if y_test_flat.shape != self.test_predictions.shape:
logging.error(f"Shape mismatch between y_test {y_test_flat.shape} and test_predictions {self.test_predictions.shape}")
else:
self.test_comparison_df = pd.DataFrame({'Actual': y_test_flat, 'Predicted': self.test_predictions})
# Set date index for test_comparison_df
self.test_comparison_df.set_index(test_date_index, inplace=True)
def evaluate_model(self):
logging.info("Evaluating LSTM model")
metrics = {'RMSE': mean_squared_error, 'R2 Score': r2_score,
'MAE': mean_absolute_error, 'Explained Variance': explained_variance_score}
evaluation = {}
for name, metric in metrics.items():
if name == 'RMSE':
train_evaluation = metric(self.train_comparison_df['Actual'],
self.train_comparison_df['Predicted'],
squared=False)
test_evaluation = metric(self.test_comparison_df['Actual'],
self.test_comparison_df['Predicted'],
squared=False)
else:
train_evaluation = metric(self.train_comparison_df['Actual'],
self.train_comparison_df['Predicted'])
test_evaluation = metric(self.test_comparison_df['Actual'],
self.test_comparison_df['Predicted'])
evaluation[name] = {'Train': train_evaluation, 'Test': test_evaluation}
self.evaluation_df = pd.DataFrame(evaluation)
logging.info("Evaluation completed")
return self.evaluation_df
def plot_history(self, plot=True):
if not plot:
return
if not hasattr(self, 'history'):
print("No training history is available. Train model first.")
return
# Extracting loss data from training history
train_loss = self.history.history['loss']
val_loss = self.history.history['val_loss']
epochs = list(range(1, len(train_loss) + 1))
# Preparing data
source = ColumnDataSource(data=dict(
epochs=epochs,
train_loss=train_loss,
val_loss=val_loss
))
p1 = figure(width=700, height=600, title="Training Loss over Epochs",x_axis_label='Epochs', y_axis_label='Loss')
hover1 = HoverTool()
hover1.tooltips = [("Epoch", "@epochs"), ("Loss", "@{train_loss}{0,0.0000}")]
p1.add_tools(hover1)
hover2 = HoverTool()
hover2.tooltips = [("Epoch", "@epochs"), ("Validation Loss", "@{val_loss}{0,0.0000}")]
p1.add_tools(hover2)
p1.line(x='epochs', y='train_loss', legend_label="Training Loss", line_width=2, source=source, color="green")
p1.line(x='epochs', y='val_loss', legend_label="Validation Loss", line_width=2, source=source, color="red")
p1.legend.location = "top_right"
p1.legend.click_policy = "hide"
output_notebook()
show(p1, notebook_handle=True)
def plot_predictions(self, plot=True):
if not plot:
return
if not hasattr(self, 'train_comparison_df') or not hasattr(self, 'test_comparison_df'):
print("No predictions are available. Generate predictions first.")
return
actual_train = self.train_comparison_df['Actual']
predicted_train = self.train_comparison_df['Predicted']
actual_test = self.test_comparison_df['Actual']
predicted_test = self.test_comparison_df['Predicted']
index_train = self.train_comparison_df.index
index_test = self.test_comparison_df.index
# Preparing data
source_train = ColumnDataSource(data=dict(
index=index_train,
actual_train=actual_train,
predicted_train=predicted_train
))
source_test = ColumnDataSource(data=dict(
index=index_test,
actual_test=actual_test,
predicted_test=predicted_test
))
p2 = figure(width=700, height=600, title="Training Data: Actual vs Predicted", x_axis_label='Date', y_axis_label='Value', x_axis_type="datetime")
p3 = figure(width=700, height=600, title="Testing Data: Actual vs Predicted",x_axis_label='Date', y_axis_label='Value', x_axis_type="datetime")
p2.line(x='index', y='actual_train', legend_label="Actual", line_width=2, source=source_train, color="green")
p2.line(x='index', y='predicted_train', legend_label="Predicted", line_width=2, source=source_train, color="red")
p3.line(x='index', y='actual_test', legend_label="Actual", line_width=2, source=source_test, color="green")
p3.line(x='index', y='predicted_test', legend_label="Predicted", line_width=2, source=source_test, color="red")
p2.legend.location = "top_left"
p2.legend.click_policy = "hide"
p3.legend.location = "top_left"
p3.legend.click_policy = "hide"
hover_train = HoverTool()
hover_train.tooltips = [
("Date", "@index{%F}"),
("Actual Value", "@{actual_train}{0,0.0000}"),
("Predicted Value", "@{predicted_train}{0,0.0000}")
]
hover_train.formatters = {"@index": "datetime"}
hover_test = HoverTool()
hover_test.tooltips = [
("Date", "@index{%F}"),
("Actual Value", "@{actual_test}{0,0.0000}"),
("Predicted Value", "@{predicted_test}{0,0.0000}")
]
hover_test.formatters = {"@index": "datetime"}
p2.add_tools(hover_train)
p3.add_tools(hover_test)
output_notebook()
show(row(p2, p3), notebook_handle=True)
@staticmethod
def update_config_hash_mapping(config_hash, config, folder_name="models_assets"):
"""
Update the configuration hash mapping.
Parameters:
config_hash (str): The MD5 hash of the configuration.
config (dict): The configuration dictionary.
folder_name (str): The name of the folder where models are saved.
"""
mapping_file_path = os.path.join(folder_name, 'config_hash_mapping.json')
if os.path.exists(mapping_file_path):
with open(mapping_file_path, 'r') as f:
existing_mappings = json.load(f)
else:
existing_mappings = {}
existing_mappings[config_hash] = config
# Save updated mappings
with open(mapping_file_path, 'w') as f:
json.dump(existing_mappings, f, indent=4)
def save_model_to_folder(self, version, folder_name="models_assets"):
"""
Save the model to a specified folder.
Parameters:
version (str): The version of the model.
folder_name (str): The name of the folder where models are saved.
"""
model_name = self.__class__.__name__ # Remove 'Enhanced_' from the class name if needed
config_str = json.dumps(self.config, sort_keys=True)
config_hash = hashlib.md5(config_str.encode()).hexdigest()[:6]
if not os.path.exists(folder_name):
os.makedirs(folder_name)
self.update_config_hash_mapping(config_hash, self.config, folder_name)
# Save the model
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
filename = f"{model_name}_V{version}_{config_hash}_{timestamp}.h5"
full_path = os.path.join(folder_name, filename)
self.model.save(full_path)
print(f"Model saved to {full_path}")
class LSTMModel(BaseModelLSTM):
def _initialize_model(self):
self.model = Sequential()
additional_params = {
'input_shape': self.config['input_shape'],
'num_lstm_layers': self.config['num_lstm_layers'],
'lstm_units': self.config['lstm_units']
}
self.params.update(additional_params)
for i in range(self.config['num_lstm_layers']):
units = self.config['lstm_units'][i]
return_sequences = True if i < self.config['num_lstm_layers'] - 1 else False
self.model.add(LSTM(units, return_sequences=return_sequences))
self.model.add(Dropout(self.config['dropout']))
for units in self.config['dense_units']:
self.model.add(Dense(units))
self.model.compile(optimizer=self.config['optimizer'], loss='mean_squared_error')
class GRUModel(BaseModelLSTM):
def _initialize_model(self):
self.model = Sequential()
for i in range(self.config['num_gru_layers']):
units = self.config['gru_units'][i]
return_sequences = True if i < self.config['num_gru_layers'] - 1 else False
self.model.add(GRU(units, return_sequences=return_sequences))
self.model.add(Dropout(self.config['dropout']))
for units in self.config['dense_units']:
self.model.add(Dense(units))
self.model.compile(optimizer=self.config['optimizer'], loss='mean_squared_error')
class BiLSTMModel(BaseModelLSTM):
def _initialize_model(self):
self.model = Sequential()
for i in range(self.config['num_lstm_layers']):
units = self.config['lstm_units'][i]
return_sequences = True if i < self.config['num_lstm_layers'] - 1 else False
self.model.add(Bidirectional(LSTM(units, return_sequences=return_sequences)))
self.model.add(Dropout(self.config['dropout']))
for units in self.config['dense_units']:
self.model.add(Dense(units))
self.model.compile(optimizer=self.config['optimizer'], loss='mean_squared_error')
class StackedRNNModel(BaseModelLSTM):
def _initialize_model(self):
additional_params = {
'input_shape': self.config['input_shape'],
'lstm_units': self.config.get('lstm_units', []),
'gru_units': self.config.get('gru_units', [])
}
self.params.update(additional_params)
input_layer = Input(shape=self.config['input_shape'])
x = input_layer
# Adding LSTM layers
for i, units in enumerate(self.config['lstm_units']):
return_sequences = True if i < len(self.config['lstm_units']) - 1 or self.config['gru_units'] else False
x = LSTM(units, return_sequences=return_sequences)(x)
x = Dropout(self.config['dropout'])(x)
# Adding GRU layers
for i, units in enumerate(self.config['gru_units']):
return_sequences = True if i < len(self.config['gru_units']) - 1 else False
x = GRU(units, return_sequences=return_sequences)(x)
x = Dropout(self.config['dropout'])(x)
# Adding Dense layers
for units in self.config['dense_units']:
x = Dense(units)(x)
self.model = Model(inputs=input_layer, outputs=x)
self.model.compile(optimizer=self.config['optimizer'], loss='mean_squared_error')
class AttentionLSTMModel(BaseModelLSTM):
"""
This class is an implementation of a LSTM model with Attention for sequence prediction.
It inherits from the BaseModelLSTM class and overrides the _initialize_model method.
"""
def _initialize_model(self):
additional_params = {
'input_shape': self.config['input_shape'],
'num_lstm_layers': self.config['num_lstm_layers'],
'lstm_units': self.config['lstm_units']
}
self.params.update(additional_params)
input_layer = Input(shape=self.config['input_shape'])
x = input_layer
# Add LSTM layers
for i in range(self.config['num_lstm_layers']):
units = self.config['lstm_units'][i]
return_sequences = True # For Attention, the last LSTM layer should also return sequences
x = LSTM(units, return_sequences=return_sequences)(x)
x = Dropout(self.config['dropout'])(x)
x = Attention(use_scale=True)([x, x]) # Self-attention
x = GlobalAveragePooling1D()(x)
for units in self.config['dense_units']:
x = Dense(units)(x)
self.model = Model(inputs=input_layer, outputs=x)
self.model.compile(optimizer=self.config['optimizer'], loss='mean_squared_error')
class SimpleRNNModel(BaseModelLSTM):
def _initialize_model(self):
self.model = Sequential()
additional_params = {
'input_shape': self.config['input_shape'],
'num_rnn_layers': self.config['num_rnn_layers'],
'rnn_units': self.config['rnn_units']
}
self.params.update(additional_params)
for i in range(self.config['num_rnn_layers']):
units = self.config['rnn_units'][i]
return_sequences = True if i < self.config['num_rnn_layers'] - 1 else False
self.model.add(SimpleRNN(units, return_sequences=return_sequences))
self.model.add(Dropout(self.config['dropout']))
for units in self.config['dense_units']:
self.model.add(Dense(units))
self.model.compile(optimizer=self.config['optimizer'], loss='mean_squared_error')
class SimpleRNNModel(BaseModelLSTM):
"""
This class is an implementation of a Simple RNN model for sequence prediction.
It inherits from the BaseModelLSTM class and overrides the _initialize_model method.
"""
def _initialize_model(self):
self.model = Sequential()
additional_params = {
'input_shape': self.config['input_shape'],
'num_rnn_layers': self.config['num_rnn_layers'],
'rnn_units': self.config['rnn_units']
}
self.params.update(additional_params)
for i in range(self.config['num_rnn_layers']):
units = self.config['rnn_units'][i]
# Make sure to set return_sequences=False for the last layer
return_sequences = True if i < self.config['num_rnn_layers'] - 1 else False
self.model.add(SimpleRNN(units, return_sequences=return_sequences))
self.model.add(Dropout(self.config['dropout']))
# Add Dense layers
for units in self.config['dense_units']:
self.model.add(Dense(units))
# Compile the model
self.model.compile(optimizer=self.config['optimizer'], loss='mean_squared_error')
class BiGRUModel(BaseModelLSTM):
"""
This class is an implementation of a bi-directional GRU model for sequence prediction.
It inherits from the BaseModelLSTM class and overrides the _initialize_model method.
"""
def _initialize_model(self):
self.model = Sequential()
additional_params = {
'input_shape': self.config['input_shape'],
'num_gru_layers': self.config['num_gru_layers'],
'gru_units': self.config['gru_units']
}
self.params.update(additional_params)
for i in range(self.config['num_gru_layers']):
units = self.config['gru_units'][i]
return_sequences = True if i < self.config['num_gru_layers'] - 1 else False
self.model.add(Bidirectional(GRU(units, return_sequences=return_sequences)))
self.model.add(Dropout(self.config['dropout']))
# If the last RNN layer returns sequences, you may need to flatten it
if return_sequences:
self.model.add(Flatten())
for units in self.config['dense_units']:
self.model.add(Dense(units))
self.model.compile(optimizer=self.config['optimizer'], loss='mean_squared_error')
class CNNLSTMModel(BaseModelLSTM):
def _initialize_model(self):
self.model = Sequential()
# Conv1D layers
for i in range(self.config['num_conv_layers']):
filters = self.config['conv_filters'][i]
kernel_size = self.config['conv_kernel_size'][i]
if i == 0:
self.model.add(Conv1D(filters=filters, kernel_size=kernel_size, activation='relu', input_shape=self.config['input_shape']))
else:
self.model.add(Conv1D(filters=filters, kernel_size=kernel_size, activation='relu'))
self.model.add(MaxPooling1D(pool_size=2))
self.model.add(GlobalMaxPooling1D())
self.model.add(Reshape((1, self.config['conv_filters'][-1])))
# LSTM layers
for i in range(self.config['num_lstm_layers']):
units = self.config['lstm_units'][i]
return_sequences = True if i < self.config['num_lstm_layers'] - 1 else False
self.model.add(LSTM(units, return_sequences=return_sequences))
self.model.add(Dropout(self.config['dropout']))
# Dense layers
for units in self.config['dense_units']:
self.model.add(Dense(units))
self.model.compile(optimizer=self.config['optimizer'], loss='mean_squared_error')
# LSTM Configuration
lstm_config = {
'input_shape': (10, 5), # Add your specific input shape here
'num_lstm_layers': 2,
'lstm_units': [50, 30],
'dropout': 0.2,
'dense_units': [1],
'optimizer': 'adam'
}
# Similarly, you can define configurations for GRU and CNN-LSTM
lstm_model = LSTMModel(data_preprocessor=data_preprocessor, config=lstm_config, model_type='LSTM')
lstm_model.train_model(epochs=100, batch_size=32)
lstm_model.make_predictions()
evaluation_df = lstm_model.evaluate_model()
print("LSTM Model Evaluation:\n", evaluation_df)
lstm_model.plot_history()
lstm_model.plot_predictions()
lstm_model.save_model_to_folder(version="1")
2023-10-02 12:56:24,358 [INFO] - Training LSTM model
Epoch 1/100 76/76 [==============================] - 8s 36ms/step - loss: 8.2664e-04 - val_loss: 0.0452 Epoch 2/100 76/76 [==============================] - 2s 25ms/step - loss: 7.7398e-04 - val_loss: 0.0029 Epoch 3/100 76/76 [==============================] - 2s 24ms/step - loss: 1.6871e-04 - val_loss: 0.0069 Epoch 4/100 76/76 [==============================] - 2s 24ms/step - loss: 2.4021e-04 - val_loss: 0.0028 Epoch 5/100 76/76 [==============================] - 2s 23ms/step - loss: 1.6592e-04 - val_loss: 0.0106 Epoch 6/100 76/76 [==============================] - 2s 21ms/step - loss: 2.5223e-04 - val_loss: 0.0045 Epoch 7/100 76/76 [==============================] - 2s 21ms/step - loss: 2.6051e-04 - val_loss: 0.0226 Epoch 8/100 76/76 [==============================] - 2s 22ms/step - loss: 3.8481e-04 - val_loss: 0.0131 Epoch 9/100 76/76 [==============================] - 2s 21ms/step - loss: 1.6735e-04 - val_loss: 0.0192 Epoch 10/100 76/76 [==============================] - 2s 22ms/step - loss: 2.5848e-04 - val_loss: 0.0118 Epoch 11/100 76/76 [==============================] - 2s 21ms/step - loss: 1.1162e-04 - val_loss: 0.0171 Epoch 12/100 76/76 [==============================] - 2s 21ms/step - loss: 2.1122e-04 - val_loss: 0.0100 Epoch 13/100 76/76 [==============================] - 2s 21ms/step - loss: 1.2604e-04 - val_loss: 0.0185 Epoch 14/100 76/76 [==============================] - 2s 22ms/step - loss: 1.8605e-04 - val_loss: 0.0100
2023-10-02 12:56:54,668 [INFO] - Training completed
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
lstm (LSTM) (None, 10, 50) 11200
dropout (Dropout) (None, 10, 50) 0
lstm_1 (LSTM) (None, 30) 9720
dropout_1 (Dropout) (None, 30) 0
dense (Dense) (None, 1) 31
=================================================================
Total params: 20,951
Trainable params: 20,951
Non-trainable params: 0
_________________________________________________________________
2023-10-02 12:56:54,695 [INFO] - Making predictions
95/95 [==============================] - 2s 8ms/step 24/24 [==============================] - 0s 7ms/step
2023-10-02 12:56:56,696 [INFO] - Raw predictions made with shapes train: (3037, 1), test: (752, 1) 2023-10-02 12:56:56,696 [INFO] - Unscaled predictions made with shapes train: (3037,), test: (752,) 2023-10-02 12:56:56,696 [INFO] - Predictions made 2023-10-02 12:56:56,709 [INFO] - Evaluating LSTM model 2023-10-02 12:56:56,716 [INFO] - Evaluation completed
LSTM Model Evaluation:
RMSE R2 Score MAE Explained Variance
Train 2894.048 0.943 1171.883 0.950
Test 6638.799 0.710 4813.794 0.857
Model saved to models_assets\LSTMModel_V1_84d4b5_20231002_125658.h5
# Configuration for BiLSTM
bi_lstm_config = {
'num_lstm_layers': 2, # Number of LSTM layers
'lstm_units': [50, 30], # Number of units for each LSTM layer
'dropout': 0.2, # Dropout rate
'dense_units': [1], # Number of units for the dense layer
'optimizer': 'adam' # Optimizer
}
# Initialize BiLSTM model
bi_lstm_model = BiLSTMModel(data_preprocessor=data_preprocessor, config=bi_lstm_config, model_type='BiLSTM')
bi_lstm_model.train_model(epochs=100, batch_size=32)
bi_lstm_model.make_predictions()
evaluation_df = bi_lstm_model.evaluate_model()
print("BiLSTM Model Evaluation:\n", evaluation_df)
bi_lstm_model.plot_history()
bi_lstm_model.plot_predictions()
bi_lstm_model.save_model_to_folder(version="1")
2023-10-02 12:56:58,466 [INFO] - Training BiLSTM model
Epoch 1/100 76/76 [==============================] - 12s 56ms/step - loss: 4.5912e-04 - val_loss: 0.0350 Epoch 2/100 76/76 [==============================] - 3s 34ms/step - loss: 0.0012 - val_loss: 0.0157 Epoch 3/100 76/76 [==============================] - 3s 34ms/step - loss: 2.5724e-04 - val_loss: 0.0079 Epoch 4/100 76/76 [==============================] - 3s 35ms/step - loss: 1.7587e-04 - val_loss: 0.0041 Epoch 5/100 76/76 [==============================] - 3s 34ms/step - loss: 1.2332e-04 - val_loss: 0.0115 Epoch 6/100 76/76 [==============================] - 3s 34ms/step - loss: 2.4075e-04 - val_loss: 0.0054 Epoch 7/100 76/76 [==============================] - 3s 34ms/step - loss: 1.7658e-04 - val_loss: 0.0160 Epoch 8/100 76/76 [==============================] - 3s 34ms/step - loss: 2.9397e-04 - val_loss: 0.0090 Epoch 9/100 76/76 [==============================] - 3s 35ms/step - loss: 1.9234e-04 - val_loss: 0.0214 Epoch 10/100 76/76 [==============================] - 3s 35ms/step - loss: 3.4391e-04 - val_loss: 0.0116 Epoch 11/100 76/76 [==============================] - 3s 34ms/step - loss: 1.7577e-04 - val_loss: 0.0226 Epoch 12/100 76/76 [==============================] - 3s 34ms/step - loss: 2.4315e-04 - val_loss: 0.0134 Epoch 13/100 76/76 [==============================] - 3s 35ms/step - loss: 9.2664e-05 - val_loss: 0.0193 Epoch 14/100 76/76 [==============================] - 3s 34ms/step - loss: 1.7162e-04 - val_loss: 0.0110
2023-10-02 12:57:44,340 [INFO] - Training completed
Model: "sequential_1"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
bidirectional (Bidirectiona (None, 10, 100) 22400
l)
dropout_2 (Dropout) (None, 10, 100) 0
bidirectional_1 (Bidirectio (None, 60) 31440
nal)
dropout_3 (Dropout) (None, 60) 0
dense_1 (Dense) (None, 1) 61
=================================================================
Total params: 53,901
Trainable params: 53,901
Non-trainable params: 0
_________________________________________________________________
2023-10-02 12:57:44,372 [INFO] - Making predictions
95/95 [==============================] - 3s 12ms/step 24/24 [==============================] - 0s 13ms/step
2023-10-02 12:57:47,481 [INFO] - Raw predictions made with shapes train: (3037, 1), test: (752, 1) 2023-10-02 12:57:47,481 [INFO] - Unscaled predictions made with shapes train: (3037,), test: (752,) 2023-10-02 12:57:47,481 [INFO] - Predictions made 2023-10-02 12:57:47,481 [INFO] - Evaluating LSTM model 2023-10-02 12:57:47,500 [INFO] - Evaluation completed
BiLSTM Model Evaluation:
RMSE R2 Score MAE Explained Variance
Train 3010.652 0.938 1107.518 0.944
Test 6755.977 0.699 4775.944 0.844
Model saved to models_assets\BiLSTMModel_V1_e0952f_20231002_125749.h5
# For GRU
gru_config = {
'num_gru_layers': 2,
'gru_units': [50, 30],
'dropout': 0.2,
'dense_units': [1],
'optimizer': 'adam'
}
# Similarly, you can define configurations for GRU and CNN-LSTM
gru_model = GRUModel(data_preprocessor=data_preprocessor, config=gru_config, model_type='GRU')
gru_model.train_model(epochs=100, batch_size=32)
gru_model.make_predictions()
evaluation_df = gru_model.evaluate_model()
print("LSTM Model Evaluation:\n", evaluation_df)
gru_model.plot_history()
gru_model.plot_predictions()
gru_model.save_model_to_folder(version="1")
2023-10-02 12:57:49,481 [INFO] - Training GRU model
Epoch 1/100 76/76 [==============================] - 6s 30ms/step - loss: 5.8076e-04 - val_loss: 0.0191 Epoch 2/100 76/76 [==============================] - 1s 19ms/step - loss: 0.0010 - val_loss: 0.0043 Epoch 3/100 76/76 [==============================] - 1s 19ms/step - loss: 2.4147e-04 - val_loss: 0.0036 Epoch 4/100 76/76 [==============================] - 2s 20ms/step - loss: 2.4987e-04 - val_loss: 7.4847e-04 Epoch 5/100 76/76 [==============================] - 1s 19ms/step - loss: 2.0603e-04 - val_loss: 0.0044 Epoch 6/100 76/76 [==============================] - 2s 22ms/step - loss: 2.6540e-04 - val_loss: 8.9519e-04 Epoch 7/100 76/76 [==============================] - 2s 22ms/step - loss: 2.8337e-04 - val_loss: 0.0067 Epoch 8/100 76/76 [==============================] - 2s 21ms/step - loss: 2.8798e-04 - val_loss: 9.0911e-04 Epoch 9/100 76/76 [==============================] - 2s 20ms/step - loss: 1.8208e-04 - val_loss: 0.0053 Epoch 10/100 76/76 [==============================] - 1s 19ms/step - loss: 2.5758e-04 - val_loss: 8.2534e-04 Epoch 11/100 76/76 [==============================] - 1s 19ms/step - loss: 1.6726e-04 - val_loss: 0.0053 Epoch 12/100 76/76 [==============================] - 1s 19ms/step - loss: 2.1802e-04 - val_loss: 7.2192e-04 Epoch 13/100 76/76 [==============================] - 1s 19ms/step - loss: 2.0023e-04 - val_loss: 0.0066 Epoch 14/100 76/76 [==============================] - 1s 20ms/step - loss: 2.7433e-04 - val_loss: 5.5605e-04 Epoch 15/100 76/76 [==============================] - 1s 19ms/step - loss: 1.6665e-04 - val_loss: 0.0055 Epoch 16/100 76/76 [==============================] - 1s 19ms/step - loss: 2.0118e-04 - val_loss: 5.3331e-04 Epoch 17/100 76/76 [==============================] - 1s 20ms/step - loss: 1.2527e-04 - val_loss: 0.0046 Epoch 18/100 76/76 [==============================] - 1s 19ms/step - loss: 1.7965e-04 - val_loss: 8.2663e-04 Epoch 19/100 76/76 [==============================] - 2s 20ms/step - loss: 1.3204e-04 - val_loss: 0.0055 Epoch 20/100 76/76 [==============================] - 2s 20ms/step - loss: 1.9593e-04 - val_loss: 6.2348e-04 Epoch 21/100 76/76 [==============================] - 1s 19ms/step - loss: 1.3802e-04 - val_loss: 0.0051 Epoch 22/100 76/76 [==============================] - 1s 20ms/step - loss: 1.8255e-04 - val_loss: 6.5262e-04 Epoch 23/100 76/76 [==============================] - 1s 19ms/step - loss: 1.3954e-04 - val_loss: 0.0064 Epoch 24/100 76/76 [==============================] - 1s 18ms/step - loss: 2.0052e-04 - val_loss: 6.1021e-04 Epoch 25/100 76/76 [==============================] - 1s 20ms/step - loss: 1.2981e-04 - val_loss: 0.0042 Epoch 26/100 76/76 [==============================] - 1s 19ms/step - loss: 1.6278e-04 - val_loss: 5.6323e-04
2023-10-02 12:58:33,551 [INFO] - Training completed
Model: "sequential_2"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
gru (GRU) (None, 10, 50) 8550
dropout_4 (Dropout) (None, 10, 50) 0
gru_1 (GRU) (None, 30) 7380
dropout_5 (Dropout) (None, 30) 0
dense_2 (Dense) (None, 1) 31
=================================================================
Total params: 15,961
Trainable params: 15,961
Non-trainable params: 0
_________________________________________________________________
2023-10-02 12:58:33,584 [INFO] - Making predictions
95/95 [==============================] - 1s 6ms/step 24/24 [==============================] - 0s 7ms/step
2023-10-02 12:58:35,254 [INFO] - Raw predictions made with shapes train: (3037, 1), test: (752, 1) 2023-10-02 12:58:35,256 [INFO] - Unscaled predictions made with shapes train: (3037,), test: (752,) 2023-10-02 12:58:35,259 [INFO] - Predictions made 2023-10-02 12:58:35,259 [INFO] - Evaluating LSTM model 2023-10-02 12:58:35,259 [INFO] - Evaluation completed
LSTM Model Evaluation:
RMSE R2 Score MAE Explained Variance
Train 715.436 0.996 374.370 0.997
Test 1546.835 0.984 982.743 0.986
Model saved to models_assets\GRUModel_V1_d5ae1c_20231002_125836.h5
bi_gru_config = {
'input_shape': (10, 30),
'num_gru_layers': 2,
'gru_units': [50, 30],
'dense_units': [1],
'dropout': 0.2,
'optimizer': 'adam'
}
bi_gru_model = BiGRUModel(data_preprocessor=data_preprocessor, config=bi_gru_config, model_type='BiGRU')
bi_gru_model.train_model(epochs=100, batch_size=32)
bi_gru_model.make_predictions()
evaluation_df = bi_gru_model.evaluate_model()
print("Stacked RNN Model Evaluation:\n", evaluation_df)
bi_gru_model.plot_history()
bi_gru_model.plot_predictions()
bi_gru_model.save_model_to_folder(version="1")
2023-10-02 12:58:36,756 [INFO] - Training BiGRU model
Epoch 1/100 76/76 [==============================] - 12s 48ms/step - loss: 5.3000e-04 - val_loss: 0.0039 Epoch 2/100 76/76 [==============================] - 2s 29ms/step - loss: 6.5128e-04 - val_loss: 0.0032 Epoch 3/100 76/76 [==============================] - 2s 30ms/step - loss: 2.7751e-04 - val_loss: 0.0012 Epoch 4/100 76/76 [==============================] - 2s 29ms/step - loss: 1.2925e-04 - val_loss: 0.0020 Epoch 5/100 76/76 [==============================] - 2s 30ms/step - loss: 2.1362e-04 - val_loss: 0.0016 Epoch 6/100 76/76 [==============================] - 2s 30ms/step - loss: 1.6007e-04 - val_loss: 0.0011 Epoch 7/100 76/76 [==============================] - 2s 30ms/step - loss: 1.1956e-04 - val_loss: 0.0020 Epoch 8/100 76/76 [==============================] - 2s 30ms/step - loss: 2.6089e-04 - val_loss: 0.0026 Epoch 9/100 76/76 [==============================] - 2s 29ms/step - loss: 3.5509e-04 - val_loss: 0.0018 Epoch 10/100 76/76 [==============================] - 2s 30ms/step - loss: 2.3375e-04 - val_loss: 7.1054e-04 Epoch 11/100 76/76 [==============================] - 2s 30ms/step - loss: 8.5147e-05 - val_loss: 6.2963e-04 Epoch 12/100 76/76 [==============================] - 2s 30ms/step - loss: 9.9613e-05 - val_loss: 6.1490e-04 Epoch 13/100 76/76 [==============================] - 2s 29ms/step - loss: 9.9575e-05 - val_loss: 5.6106e-04 Epoch 14/100 76/76 [==============================] - 2s 30ms/step - loss: 8.7411e-05 - val_loss: 5.7140e-04 Epoch 15/100 76/76 [==============================] - 2s 30ms/step - loss: 7.6792e-05 - val_loss: 4.1500e-04 Epoch 16/100 76/76 [==============================] - 2s 30ms/step - loss: 1.0982e-04 - val_loss: 5.9061e-04 Epoch 17/100 76/76 [==============================] - 2s 30ms/step - loss: 1.6075e-04 - val_loss: 6.6749e-04 Epoch 18/100 76/76 [==============================] - 2s 30ms/step - loss: 1.5500e-04 - val_loss: 7.5991e-04 Epoch 19/100 76/76 [==============================] - 2s 29ms/step - loss: 1.0649e-04 - val_loss: 8.1429e-04 Epoch 20/100 76/76 [==============================] - 2s 29ms/step - loss: 1.0379e-04 - val_loss: 0.0013 Epoch 21/100 76/76 [==============================] - 2s 30ms/step - loss: 8.7688e-05 - val_loss: 0.0011 Epoch 22/100 76/76 [==============================] - 2s 29ms/step - loss: 8.5194e-05 - val_loss: 0.0014 Epoch 23/100 76/76 [==============================] - 2s 30ms/step - loss: 8.4920e-05 - val_loss: 0.0011 Epoch 24/100 76/76 [==============================] - 2s 30ms/step - loss: 8.8361e-05 - val_loss: 0.0019 Epoch 25/100 76/76 [==============================] - 2s 30ms/step - loss: 1.0226e-04 - val_loss: 0.0027
2023-10-02 12:59:43,164 [INFO] - Training completed
Model: "sequential_3"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
bidirectional_2 (Bidirectio (None, 10, 100) 17100
nal)
dropout_6 (Dropout) (None, 10, 100) 0
bidirectional_3 (Bidirectio (None, 60) 23760
nal)
dropout_7 (Dropout) (None, 60) 0
dense_3 (Dense) (None, 1) 61
=================================================================
Total params: 40,921
Trainable params: 40,921
Non-trainable params: 0
_________________________________________________________________
2023-10-02 12:59:43,191 [INFO] - Making predictions
95/95 [==============================] - 3s 10ms/step 24/24 [==============================] - 0s 9ms/step
2023-10-02 12:59:46,343 [INFO] - Raw predictions made with shapes train: (3037, 1), test: (752, 1) 2023-10-02 12:59:46,343 [INFO] - Unscaled predictions made with shapes train: (3037,), test: (752,) 2023-10-02 12:59:46,343 [INFO] - Predictions made 2023-10-02 12:59:46,343 [INFO] - Evaluating LSTM model 2023-10-02 12:59:46,358 [INFO] - Evaluation completed
Stacked RNN Model Evaluation:
RMSE R2 Score MAE Explained Variance
Train 1700.143 0.980 1187.831 0.990
Test 3009.585 0.940 2261.224 0.974
Model saved to models_assets\BiGRUModel_V1_512ccd_20231002_125948.h5
simple_rnn_config = {
'input_shape': (10, 30),
'num_rnn_layers': 2,
'rnn_units': [50, 30],
'dense_units': [1],
'dropout': 0.2,
'optimizer': 'adam'
}
simple_rnn_model = SimpleRNNModel(data_preprocessor=data_preprocessor, config=simple_rnn_config, model_type='SimpleRNN')
simple_rnn_model.train_model(epochs=100, batch_size=32)
simple_rnn_model.make_predictions()
evaluation_df = simple_rnn_model.evaluate_model()
print("Stacked RNN Model Evaluation:\n", evaluation_df)
simple_rnn_model.plot_history()
simple_rnn_model.plot_predictions()
simple_rnn_model.save_model_to_folder(version="1")
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Cell In [47], line 10 1 simple_rnn_config = { 2 'input_shape': (10, 30), 3 'num_rnn_layers': 2, (...) 7 'optimizer': 'adam' 8 } ---> 10 simple_rnn_model = SimpleRNNModel(data_preprocessor=data_preprocessor, config=simple_rnn_config, model_type='SimpleRNN') 11 simple_rnn_model.train_model(epochs=100, batch_size=32) 12 simple_rnn_model.make_predictions() Cell In [39], line 20, in BaseModelLSTM.__init__(self, model_type, data_preprocessor, config, cross_val) 18 self.params = {'model_type': model_type} 19 self.params.update(config) ---> 20 self._initialize_model() 21 self.logger = logging.getLogger(__name__) Cell In [46], line 146, in SimpleRNNModel._initialize_model(self) 144 # Make sure to set return_sequences=False for the last layer 145 return_sequences = True if i < self.config['num_rnn_layers'] - 1 else False --> 146 self.model.add(SimpleRNN(units, return_sequences=return_sequences)) 147 self.model.add(Dropout(self.config['dropout'])) 149 # Add Dense layers NameError: name 'SimpleRNN' is not defined
stacked_rnn_config = {
'input_shape': (10, 5), # Add your specific input shape here
'lstm_units': [50, 30],
'gru_units': [20],
'dropout': 0.2,
'dense_units': [1],
'optimizer': 'adam'
}
stacked_rnn_model = StackedRNNModel(data_preprocessor=data_preprocessor, config=stacked_rnn_config, model_type='StackedRNN')
stacked_rnn_model.train_model(epochs=100, batch_size=32)
stacked_rnn_model.make_predictions()
evaluation_df = stacked_rnn_model.evaluate_model()
print("Stacked RNN Model Evaluation:\n", evaluation_df)
stacked_rnn_model.plot_history()
stacked_rnn_model.plot_predictions()
stacked_rnn_model.save_model_to_folder(version="1")
# Configuration for AttentionLSTM
attention_lstm_config = {
'input_shape': (10, 5), # Add your specific input shape here
'num_lstm_layers': 2,
'lstm_units': [50, 30],
'dropout': 0.2,
'dense_units': [1],
'optimizer': 'adam'
}
# Initialize AttentionLSTM model
attention_lstm_model = AttentionLSTMModel(data_preprocessor=data_preprocessor, config=attention_lstm_config, model_type='AttentionLSTM')
attention_lstm_model.train_model(epochs=100, batch_size=32)
attention_lstm_model.make_predictions()
evaluation_df = attention_lstm_model.evaluate_model()
print("AttentionLSTM Model Evaluation:\n", evaluation_df)
attention_lstm_model.plot_history()
attention_lstm_model.plot_predictions()
attention_lstm_model.save_model_to_folder(version="1")
"""
# For CNN-LSTM
cnn_lstm_config = {
'num_conv_layers': 1,
'conv_filters': [64],
'conv_kernel_size': [3],
'num_lstm_layers': 1,
'lstm_units': [50],
'dropout': 0.2,
'dense_units': [1],
'optimizer': 'adam'
}
cnn_lstm_model = CNNLSTMModel(data_preprocessor=data_preprocessor, config=cnn_lstm_config, model_type='CNN-LSTM')
cnn_lstm_model.train_model(epochs=100, batch_size=32)
cnn_lstm_model.make_predictions()
evaluation_df = cnn_lstm_model.evaluate_model()
print("CNN-LSTM Model Evaluation:\n", evaluation_df)
cnn_lstm_model.plot_history()
cnn_lstm_model.plot_predictions()
"""
stop
# LSTM Sequece-to-One
tsa = TimeSeriesAnalysis(df, target='Close')
data_preprocessor = UnifiedDataPreprocessor(df, target_column='Close')
data_preprocessor.split_and_plot_data(test_size=0.2, plot=False)
data_preprocessor.normalize_data(scaler_type='MinMax',plot=False)
data_preprocessor.normalize_target(scaler_type='MinMax',plot=False)
n_steps = 10
X_train_seq, y_train_seq, X_test_seq, y_test_seq = data_preprocessor.prepare_data_for_recurrent(n_steps, seq_to_seq=False)
print((data_preprocessor.X_train_seq).shape)
print((data_preprocessor.y_train_seq).shape)
print((data_preprocessor.X_test_seq).shape)
print((data_preprocessor.y_test_seq).shape)
print(hasattr(data_preprocessor, 'X_train_seq'))
class EnhancedLSTMHyperModel(HyperModel):
"""
A HyperModel subclass for building enhanced LSTM models for hyperparameter tuning.
Searchable Hyperparameters:
- num_layers (int): The number of LSTM layers (Range: 1 to 4).
- units (int): The number of units in each LSTM layer (Range: 32 to 256, step: 32).
- recurrent_dropout (float): The dropout rate for the recurrent units in each LSTM layer (Range: 0.0 to 0.5, step: 0.05).
- activation (str): The activation function to use in each LSTM layer. Can be 'tanh', 'sigmoid', or 'relu'.
- regularizer (str): The regularization to apply on the kernel weights matrix in each LSTM layer. Can be 'l1', 'l2', or 'l1_l2'.
- dropout (float): The dropout rate for Dropout layers after each LSTM layer (Range: 0.0 to 0.5, step: 0.05).
- dense_units (int): The number of units in the Dense layer (Range: 1 to 3).
- dense_activation (str): The activation function to use in the Dense layer. Can be 'relu', 'linear', 'sigmoid', or 'tanh'.
- learning_rate (float): The learning rate for the optimizer. Can be 1e-2, 1e-3, or 1e-4.
- optimizer (str): The optimizer to use. Can be 'adam', 'sgd', 'rmsprop', 'adagrad', 'adadelta', 'nadam', or 'ftrl'.
Hardcoded Parameters:
- Loss Function: 'mean_squared_error'
- Metrics: ['mean_absolute_error']
- Input Shape: Determined by the shape of training data.
Usage:
Initialize the hypermodel with data_preprocessor object containing preprocessed training and test data, and then use it for hyperparameter tuning using suitable tuners like Bayesian Optimization, Random Search, etc."""
def __init__(self, data_preprocessor):
self.input_shape = data_preprocessor.X_train_seq.shape[1:]
self.X_train = data_preprocessor.X_train_seq
self.y_train = data_preprocessor.y_train_seq
self.X_test = data_preprocessor.X_test_seq
self.y_test = data_preprocessor.y_test_seq
self.logger = logging.getLogger(__name__)
def build(self, hp):
logging.info("Building LSTM model...")
model = Sequential()
num_layers = hp.Int('num_layers', 1, 4)
logging.info(f"Setting up {num_layers} LSTM layers.")
for i in range(num_layers):
units = hp.Int('units_' + str(i), min_value=32, max_value=256, step=32)
logging.info(f"Layer {i+1}: Setting up LSTM with {units} units.")
if i == 0: # Only provide the input_shape for the first LSTM layer
model.add(LSTM(units=units,
recurrent_dropout=hp.Float('recurrent_dropout_'+str(i), min_value=0.0, max_value=0.5, step=0.05),
activation=hp.Choice('activation_'+str(i), ['tanh', 'sigmoid', 'relu']),
kernel_regularizer=hp.Choice('regularizer_' + str(i), ['l1', 'l2', 'l1_l2']),
return_sequences=True if i < num_layers - 1 else False,
input_shape=self.input_shape))
else:
model.add(LSTM(units=units,
recurrent_dropout=hp.Float('recurrent_dropout_'+str(i), min_value=0.0, max_value=0.5, step=0.05),
activation=hp.Choice('activation_'+str(i), ['tanh', 'sigmoid', 'relu']),
kernel_regularizer=hp.Choice('regularizer_' + str(i), ['l1', 'l2', 'l1_l2']),
return_sequences=True if i < num_layers - 1 else False))
model.add(BatchNormalization())
dropout_rate = hp.Float('dropout_'+str(i), min_value=0.0, max_value=0.5, step=0.05)
logging.info(f"Layer {i+1}: Setting up Dropout with rate {dropout_rate}.")
model.add(Dropout(rate=dropout_rate))
dense_units = hp.Int('dense_units', 1, 3)
logging.info(f"Setting up Dense layer with {dense_units} units.")
model.add(Dense(units=dense_units, activation=hp.Choice('dense_activation', ['relu', 'linear', 'sigmoid', 'tanh'])))
lr = hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4])
optimizer_choice = hp.Choice('optimizer', ['adam', 'sgd', 'rmsprop', 'adagrad', 'adadelta', 'nadam', 'ftrl'])
if optimizer_choice == 'adam':
optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
elif optimizer_choice == 'sgd':
optimizer = tf.keras.optimizers.SGD(learning_rate=lr)
elif optimizer_choice == 'rmsprop':
optimizer = tf.keras.optimizers.RMSprop(learning_rate=lr)
elif optimizer_choice == 'adagrad':
optimizer = tf.keras.optimizers.Adagrad(learning_rate=lr)
elif optimizer_choice == 'adadelta':
optimizer = tf.keras.optimizers.Adadelta(learning_rate=lr)
elif optimizer_choice == 'nadam':
optimizer = tf.keras.optimizers.Nadam(learning_rate=lr)
elif optimizer_choice == 'ftrl':
optimizer = tf.keras.optimizers.Ftrl(learning_rate=lr)
logging.info(f"Compiling model with optimizer {optimizer_choice} and learning rate {lr}.")
model.compile(optimizer=optimizer, loss='mean_squared_error', metrics=['mean_absolute_error'])
logging.info("Model built successfully!")
return model
stop
hypermodel = EnhancedLSTMHyperModel(data_preprocessor=data_preprocessor)
tuner = BayesianOptimization(
hypermodel,
objective='val_loss',
max_trials=100,
directory='bayesian_optimization',
project_name='lstm'
)
# Callbacks
lr_schedule = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, verbose=1, mode='min')
early_stopping_callback = EarlyStopping(monitor='val_loss', patience=10, verbose=1, mode='min', restore_best_weights=True)
# Start the search
tuner.search(hypermodel.X_train, hypermodel.y_train, epochs=25, validation_split=0.2, callbacks=[early_stopping_callback, lr_schedule])
best_model = tuner.get_best_models()[0]
best_hyperparameters = tuner.get_best_hyperparameters()[0]
# Display the best model's architecture
best_model.summary()
print(best_hyperparameters.values)
class Class2HyperModel(HyperModel):
"""
A HyperModel subclass for building sequence-based deep learning models for hyperparameter tuning.
Searchable Hyperparameters:
- num_layers (int): The number of sequence layers (Range: 1 to 4).
- layer_type (str): The type of sequence layer. Can be 'LSTM', 'GRU', 'BidirectionalLSTM', or 'BidirectionalGRU'.
- units (int): The number of units in a layer (Range: 32 to 256, step: 32).
- kernel_initializer (str): The initializer for the kernel weights matrix.
- recurrent_dropout (float): The dropout rate for the recurrent units (Range: 0.0 to 0.5, step: 0.05).
- activation (str): The activation function to use. Can be 'tanh', 'sigmoid', or 'relu'.
- momentum (float): Momentum for the moving average in BatchNormalization layer (Range: 0.0 to 1.0, step: 0.1).
- epsilon (float): Small float added to variance to avoid dividing by zero in BatchNormalization layer (Range: 1e-5 to 1e-2, step: 1e-5).
- dropout (float): The dropout rate for Dropout layers (Range: 0.0 to 0.5, step: 0.05).
- dense_units (int): The number of units in the Dense layer (Range: 1 to 3).
- dense_activation (str): The activation function to use in the Dense layer. Can be 'relu', 'linear', 'sigmoid', or 'tanh'.
- optimizer (str): The optimizer to use. Can be 'adam', 'sgd', 'rmsprop', 'adagrad', 'adadelta', 'nadam', or 'ftrl'.
- learning_rate (float): The learning rate for the optimizer. Can be 1e-2, 1e-3, or 1e-4.
Hardcoded Parameters:
- Loss Function: 'mean_squared_error'
- Metrics: ['mean_absolute_error']
- Input Shape: Determined by the shape of training data.
- The directory for Bayesian Optimization: 'bayesian_optimization'
- Project Name for Bayesian Optimization: 'final'
- Max Trials for Bayesian Optimization: 100
- Number of Epochs: 25
- Validation Split: 0.2
Callbacks:
- ReduceLROnPlateau: Reduces the learning rate when a metric has stopped improving.
- EarlyStopping: Stops training when a monitored metric has stopped improving.
"""
def __init__(self, data_preprocessor):
self.input_shape = data_preprocessor.X_train_seq.shape[1:]
self.X_train = data_preprocessor.X_train_seq
self.y_train = data_preprocessor.y_train_seq
self.X_test = data_preprocessor.X_test_seq
self.y_test = data_preprocessor.y_test_seq
self.logger = logging.getLogger(__name__)
def build(self, hp):
self.logger.info("Building sequence-based deep learning model...")
model = tf.keras.Sequential()
# Dictionary for Optimizers
optimizer_dict = {
'adam': tf.keras.optimizers.Adam,
'sgd': tf.keras.optimizers.SGD,
'rmsprop': tf.keras.optimizers.RMSprop,
'adagrad': tf.keras.optimizers.Adagrad,
'adadelta': tf.keras.optimizers.Adadelta,
'nadam': tf.keras.optimizers.Nadam,
'ftrl': tf.keras.optimizers.Ftrl
}
# Number of sequence layers
num_layers = hp.Int('num_layers', 1, 4)
last_layer_type = None
for i in range(num_layers):
# Type of sequence layer: LSTM, GRU, Bidirectional
layer_type = hp.Choice(f'layer_type_{i}', ['LSTM', 'GRU', 'BidirectionalLSTM', 'BidirectionalGRU'])
units = hp.Int(f'units_{i}', min_value=32, max_value=256, step=32)
kernel_initializer = hp.Choice(f'kernel_initializer_{i}', ['glorot_uniform', 'he_normal', 'lecun_normal'])
# Layer Parameters
layer_params = {
'units': units,
'kernel_initializer': kernel_initializer,
'recurrent_dropout': hp.Float(f'recurrent_dropout_{i}', min_value=0.0, max_value=0.5, step=0.05),
'activation': hp.Choice(f'activation_{i}', ['tanh', 'sigmoid', 'relu']),
'return_sequences': True if i < num_layers - 1 else False
}
if i == 0: # First layer needs to specify input shape
layer_params['input_shape'] = self.input_shape
# Add sequence layer based on choice
if 'Bidirectional' in layer_type:
rnn_layer_type = layer_type.split('Bidirectional')[1]
rnn_layer = LSTM(**layer_params) if rnn_layer_type == 'LSTM' else GRU(**layer_params)
model.add(Bidirectional(rnn_layer))
elif layer_type == 'LSTM':
model.add(LSTM(**layer_params))
elif layer_type == 'GRU':
model.add(GRU(**layer_params))
last_layer_type = layer_type
# Add BatchNormalization layer
model.add(BatchNormalization(
momentum=hp.Float(f'momentum_{i}', min_value=0.0, max_value=1.0, step=0.1),
epsilon=hp.Float(f'epsilon_{i}', min_value=1e-5, max_value=1e-2, step=1e-5)
))
# Add Dropout layer
model.add(Dropout(rate=hp.Float(f'dropout_{i}', min_value=0.0, max_value=0.5, step=0.05)))
# Add Dense layer
model.add(Dense(
units=hp.Int('dense_units', 1, 3),
activation=hp.Choice('dense_activation', ['relu', 'linear', 'sigmoid', 'tanh'])
))
# Compile the model
optimizer_choice = hp.Choice('optimizer', list(optimizer_dict.keys()))
optimizer = optimizer_dict[optimizer_choice](learning_rate=hp.Choice('learning_rate', [1e-2, 1e-3, 1e-4]))
model.compile(optimizer=optimizer, loss='mean_squared_error', metrics=['mean_absolute_error'])
return model
hypermodel = Class2HyperModel(data_preprocessor=data_preprocessor)
# Initialize the tuner
tuner = BayesianOptimization(
hypermodel,
objective='val_loss',
max_trials=100, # Feel free to adjust this based on computational capacity
directory='bayesian_optimization',
project_name='lstm_gru_bidirectional' # Updated project name to reflect the new model types
)
# Callbacks remain the same
lr_schedule = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, verbose=1, mode='min')
early_stopping_callback = EarlyStopping(monitor='val_loss', patience=10, verbose=1, mode='min', restore_best_weights=True)
tuner.search(hypermodel.X_train, hypermodel.y_train, epochs=25, validation_split=0.2, callbacks=[early_stopping_callback, lr_schedule])
best_model = tuner.get_best_models()[0]
best_hyperparameters = tuner.get_best_hyperparameters()[0]
# Display the best model's architecture
best_model.summary()
print(best_hyperparameters.values)
class BaseTimeSeriesModel(BaseModel):
"""
A base class for ARIMA-based models.
This class will handle the initialization of time series-specific properties and methods.
"""
def __init__(self, data_preprocessor, config, plot=True):
super().__init__(data_preprocessor, config, plot)
# Initialize any time series-specific properties here
# Any common methods for ARIMA-based models
class Enhanced_ARIMA(BaseTimeSeriesModel):
def __init__(self, data_preprocessor, config, plot=True):
super().__init__(data_preprocessor, config, plot)
self._initialize_model()
def _initialize_model(self):
# Initialize the ARIMA model based on the configuration
pass
def train_model(self):
# Train the ARIMA model
pass
def make_predictions(self):
# Make predictions using the trained ARIMA model
pass
# Similar classes can be created for SARIMA and SARIMAX